unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
* Re: Tree-sitter integration on feature/tree-sitter
@ 2022-05-19  1:35 Kiong-Ge Liau
  2022-05-20  2:01 ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Kiong-Ge Liau @ 2022-05-19  1:35 UTC (permalink / raw)
  To: casouri, emacs-devel

Can you please share the mentioned "treesit-demo.el" file? I cannot see
it attached to any messages on emacs-devel mailing list. 

Thanks.






^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-19  1:35 Tree-sitter integration on feature/tree-sitter Kiong-Ge Liau
@ 2022-05-20  2:01 ` Yuan Fu
  2022-06-16 19:03   ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-20  2:01 UTC (permalink / raw)
  To: Kiong-Ge Liau; +Cc: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 275 bytes --]



> On May 18, 2022, at 6:35 PM, Kiong-Ge Liau <lkg.ch@pm.me> wrote:
> 
> Can you please share the mentioned "treesit-demo.el" file? I cannot see
> it attached to any messages on emacs-devel mailing list. 
> 
> Thanks.
> 

Here it is, take it with a grain of salt :-)

Yuan


[-- Attachment #2: treesit-demo.el --]
[-- Type: application/octet-stream, Size: 6624 bytes --]

;; -*- lexical-binding: t -*-

(require 'treesit)
(require 'pp)
(require 'rx)

(defun treesit-show-buffer-tree ()
  "Show the tree-sitter syntax tree as text in a popup buffer.
The root node is obtained from the C parser returned by
`treesit-get-parser-create'.  Its string representation is
inserted into the buffer \"*treesit-show-tree*\" (replacing any
previous contents), a newline is placed before every opening
parenthesis and every \"WORD:\" token, and the result is indented
as Lisp."
  (interactive)
  ;; The root node is fetched before `pop-to-buffer' switches buffers;
  ;; presumably the parser is looked up in the buffer that is current
  ;; at this point (the one being inspected) -- TODO confirm.
  (let ((root-node (treesit-parser-root-node
                    (treesit-get-parser-create 'c))))
    (pop-to-buffer (get-buffer-create "*treesit-show-tree*"))
    (erase-buffer)
    (insert (treesit-node-string root-node))
    ;; Format the output.
    (goto-char (point-min))
    (while (re-search-forward (rx (or "(" (seq (+ word) ":"))) nil t)
      (goto-char (match-beginning 0))
      (insert "\n")
      ;; The match data is not adjusted for the newline just inserted,
      ;; so adding 1 puts point right after the matched text and the
      ;; next search cannot match the same token again.
      (goto-char (1+ (match-end 0))))
    ;; Use `setq-local': `indent-line-function' is not automatically
    ;; buffer-local, so a plain `setq' here would change the default
    ;; indentation function of every other buffer.
    (setq-local indent-line-function #'lisp-indent-line)
    (indent-region (point-min) (point-max))))

(defun ts-c-fontify-system-lib (beg end _)
  "Fontify the <lib> part of a #include <lib> between BEG and END.
The first and the last character of the region (the angle
brackets) get `font-lock-preprocessor-face'; the text between
them (the library name) gets `font-lock-string-face'.  The third
argument is ignored."
  (let ((name-beg (1+ beg))
        (name-end (1- end)))
    ;; Opening bracket, closing bracket, then the name in between.
    (put-text-property beg name-beg 'face 'font-lock-preprocessor-face)
    (put-text-property name-end end 'face 'font-lock-preprocessor-face)
    (put-text-property name-beg name-end 'face 'font-lock-string-face)))

;; Forward declaration to keep the byte-compiler quiet; the variable
;; gets its value in a `defvar' further down in this file.
(defvar ts-c-treesit-indent-rules)
(define-derived-mode ts-c-mode prog-mode "TS C"
  "Major mode for C with tree-sitter support.
When `treesit-should-enable-p' returns non-nil, font-lock and
indentation are set up through tree-sitter; otherwise the
font-lock settings copied from cc-mode are used."
  (cond
   ((treesit-should-enable-p)
    (setq-local treesit-font-lock-defaults '((ts-c-treesit-settings-1)))
    (setq-local font-lock-defaults '(nil t))
    (setq-local indent-line-function #'treesit-indent)
    (setq-local treesit-simple-indent-rules ts-c-treesit-indent-rules)
    (treesit-font-lock-enable))
   (t
    ;; Copied from cc-mode.
    (setq-local font-lock-defaults
                '((c-font-lock-keywords
                   c-font-lock-keywords-1
                   c-font-lock-keywords-2
                   c-font-lock-keywords-3)
                  nil nil
                  ((95 . "w")
                   (36 . "w"))
                  c-beginning-of-syntax
                  (font-lock-mark-block-function . c-mark-function))))))

(defvar ts-c-treesit-indent-rules
  `((c
     ;; Empty line.
     (no-node prev-line 0)

     ;; Function/struct definition body {}.
     ((match nil "function_definition" "body") parent 0)
     ((node-is "field_declaration_list") parent 0)

     ;; Call expression.
     ((parent-is "call_expression") parent 2)

     ;; If-else.
     ((match nil "if_statement" "condition") parent 2)
     ((match nil "if_statement" "consequence") parent 2)
     ((match nil "if_statement" "alternative") parent 2)
     ((match nil "switch_statement" "condition")  parent 2)
     ((node-is "else") parent 0)

     ;; Switch case.
     ((parent-is "case_statement") parent 2)
     ((node-is "case_statement") parent 0)

     ;; { and }.
     ((node-is "compound_statement") parent 2)
     ((node-is "}") parent 0)

     ;; Multi-line string.
     ((parent-is "string_literal") no-indent 0)

     ;; List.
     ;; Two rules are generated per list-like node type, in the order
     ;; the types are listed.  The built-in `mapcan' is used instead of
     ;; `cl-loop' so that this file does not depend on cl-lib having
     ;; been loaded by some other library.
     ,@(mapcan (lambda (type)
                 (list `((match nil ,type nil 0 0) parent 2)
                       `((match nil ,type nil 1) first-sibling 0)))
               '("compound_statement" "initializer_list"
                 "argument_list" "parameter_list"
                 "field_declaration_list"))))
  "Indentation rules for C, used by `ts-c-mode'.
`ts-c-mode' installs this value as `treesit-simple-indent-rules'.
The value is a list with one entry for the language `c', whose
cdr is the list of indentation rules.")

(defvar ts-c-treesit-settings-1
  `((c
     ,(treesit-expand-query
       '(;; Constants.
         (null) @font-lock-constant-face
         (true) @font-lock-constant-face
         (false) @font-lock-constant-face

         (comment) @font-lock-comment-face

         ;; Captured with a function instead of a face.
         (system_lib_string) @ts-c-fontify-system-lib

         (unary_expression
          operator: _ @font-lock-negation-char-face)

         (string_literal) @font-lock-string-face
         (char_literal) @font-lock-string-face

         ;; Function names.
         (function_definition
          declarator: (identifier) @font-lock-function-name-face)

         ;; A bare identifier directly under a declaration is a
         ;; variable (as in "int x;"), not a function: prototypes wrap
         ;; the name in a function_declarator, handled just below.
         (declaration
          declarator: (identifier) @font-lock-variable-name-face)

         (function_declarator
          declarator: (identifier) @font-lock-function-name-face)

         ;; Variable names.
         (init_declarator
          declarator: (identifier) @font-lock-variable-name-face)

         (parameter_declaration
          declarator: (identifier) @font-lock-variable-name-face)

         (preproc_def
          name: (identifier) @font-lock-variable-name-face)

         (enumerator
          name: (identifier) @font-lock-variable-name-face)

         (field_identifier) @font-lock-variable-name-face

         (parameter_list
          (parameter_declaration
           (identifier) @font-lock-variable-name-face))

         (pointer_declarator
          declarator: (identifier) @font-lock-variable-name-face)

         (array_declarator
          declarator: (identifier) @font-lock-variable-name-face)

         (preproc_function_def
          name: (identifier) @font-lock-variable-name-face
          parameters: (preproc_params
                       (identifier) @font-lock-variable-name-face))

         ;; Types.
         (type_identifier) @font-lock-type-face
         (primitive_type) @font-lock-type-face

         ;; Keywords.
         "auto" @font-lock-keyword-face
         "break" @font-lock-keyword-face
         "case" @font-lock-keyword-face
         "const" @font-lock-keyword-face
         "continue" @font-lock-keyword-face
         "default" @font-lock-keyword-face
         "do" @font-lock-keyword-face
         "else" @font-lock-keyword-face
         "enum" @font-lock-keyword-face
         "extern" @font-lock-keyword-face
         "for" @font-lock-keyword-face
         "goto" @font-lock-keyword-face
         "if" @font-lock-keyword-face
         "register" @font-lock-keyword-face
         "return" @font-lock-keyword-face
         "sizeof" @font-lock-keyword-face
         "static" @font-lock-keyword-face
         "struct" @font-lock-keyword-face
         "switch" @font-lock-keyword-face
         "typedef" @font-lock-keyword-face
         "union" @font-lock-keyword-face
         "volatile" @font-lock-keyword-face
         "while" @font-lock-keyword-face

         ;; Type modifiers.
         "long" @font-lock-type-face
         "short" @font-lock-type-face
         "signed" @font-lock-type-face
         "unsigned" @font-lock-type-face

         ;; Preprocessor directives.
         "#include" @font-lock-preprocessor-face
         "#define" @font-lock-preprocessor-face
         "#ifdef" @font-lock-preprocessor-face
         "#ifndef" @font-lock-preprocessor-face
         "#endif" @font-lock-preprocessor-face
         "#else" @font-lock-preprocessor-face
         "#elif" @font-lock-preprocessor-face
         ))))
  "Tree-sitter font-lock settings for C, used by `ts-c-mode'.
`ts-c-mode' refers to this variable by name in
`treesit-font-lock-defaults'.  The value is a list with one entry
for the language `c', holding the query produced by
`treesit-expand-query'.  Each capture name is a face, except
`ts-c-fontify-system-lib', which is a function.")

;;; Config

;; Visit files whose names end in ".tsc" in `ts-c-mode'.
(add-to-list 'auto-mode-alist (cons "\\.tsc\\'" 'ts-c-mode))

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-20  2:01 ` Yuan Fu
@ 2022-06-16 19:03   ` Yuan Fu
  2022-06-16 19:25     ` [External] : " Drew Adams
                       ` (4 more replies)
  0 siblings, 5 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-16 19:03 UTC (permalink / raw)
  To: Emacs Devel

Hey,

I’ve just finished with Real Life and got back to tree-sitter. I’ll reply to individual messages separately, but here is a summary of all the latest changes pushed to feature/tree-sitter

- Now one can compile a query; a compiled query is much faster than an uncompiled one.
- Traversal functions now have a parameter that controls how deep to traverse.
- Removed the ltree-sitter setting in configure.ac
- Consolidated all the parser creation functions into one: treesit-parser-create, that means treesit-get-parser and treesit-get-parser-create are removed.

I think these are all the pending requests (sans highlight-paren), please let me know if I missed anything.

Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list, and add a function to remove a parser from the parser list, because I’m not comfortable letting users remove and re-add parsers into the list anymore. Previously we determined that if a user wants to do the wrong thing, so be it. But now I realize that there is a danger of crashing Emacs if a user fiddles with treesit-parser-list incorrectly (and violates some of the assertions I put in).

Can I just add a new Lisp_Object field in struct buffer? I assume that’s how you add an internal buffer-local data?

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* RE: [External] : Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
@ 2022-06-16 19:25     ` Drew Adams
  2022-06-17  1:11       ` Yuan Fu
  2022-06-17  1:24     ` Po Lu
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 187+ messages in thread
From: Drew Adams @ 2022-06-16 19:25 UTC (permalink / raw)
  To: Yuan Fu, Emacs Devel

> I’ve just finished with Real Life and got back to tree-sitter. 

The afterlife revealed, finally. ;-)


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: [External] : Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:25     ` [External] : " Drew Adams
@ 2022-06-17  1:11       ` Yuan Fu
  2022-06-17 14:22         ` Drew Adams
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-17  1:11 UTC (permalink / raw)
  To: Drew Adams; +Cc: Emacs Devel



> On Jun 16, 2022, at 12:25 PM, Drew Adams <drew.adams@oracle.com> wrote:
> 
>> I’ve just finished with Real Life and got back to tree-sitter. 
> 
> The afterlife revealed, finally. ;-)
> 

I’ll be very glad if my afterlife includes hacking Emacs.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
  2022-06-16 19:25     ` [External] : " Drew Adams
@ 2022-06-17  1:24     ` Po Lu
  2022-06-18  0:09       ` Yuan Fu
  2022-06-17  2:00     ` Ihor Radchenko
                       ` (2 subsequent siblings)
  4 siblings, 1 reply; 187+ messages in thread
From: Po Lu @ 2022-06-17  1:24 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

Yuan Fu <casouri@gmail.com> writes:

> Can I just add a new Lisp_Object field in struct buffer? I assume
> that’s how you add an internal buffer-local data?

Yes.  Make sure the field is placed before
`cursor_in_non_selected_windows_', or it won't be traced by GC.

Also make sure to access it using the `BVAR' macro and add a
corresponding `DEFVAR_PER_BUFFER' form in syms_of_buffer if it's
supposed to be a buffer local variable.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
  2022-06-16 19:25     ` [External] : " Drew Adams
  2022-06-17  1:24     ` Po Lu
@ 2022-06-17  2:00     ` Ihor Radchenko
  2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
  2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
  2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
  2022-06-17 11:06     ` Jostein Kjønigsen
  4 siblings, 2 replies; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-17  2:00 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

Yuan Fu <casouri@gmail.com> writes:

> I’ve just finished with Real Life and got back to tree-sitter. I’ll reply to individual messages separately, but here is a summary of all the latest changes pushed to feature/tree-sitter

Would it be possible to expose ts_record_change to Elisp?

I am asking in the interest of Org mode parser that is also parsing the
buffer AST and tracks buffer modifications.

The built-in after-change-functions are not reliable because they can be
(and often are) easily suppressed by with-silent-modifications macro.
See bug#46982 and bug#51766.

Best,
Ihor




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Lower-level change hook immune to with-silent-modifications
  2022-06-17  2:00     ` Ihor Radchenko
@ 2022-06-17  2:25       ` Yuan Fu
  2022-06-17  2:55         ` Stefan Monnier
  2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-17  2:25 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Emacs Devel

> 
> Would it be possible to expose ts_record_change to Elisp?
> 
> I am asking in the interest of Org mode parser that is also parsing the
> buffer AST and tracks buffer modifications.
> 
> The built-in after-change-functions are not reliable because they can be
> (and often are) easily suppressed by with-silent-modifications macro.
> See bug#46982 and bug#51766.

I think you probably want a separate hook just for this purpose, rather than repurposing ts_record_change. We could have a lower-level after-change-functions that is immune to with-silent-modifications. Whether we should add such a hook is probably another discussion. (So I opened a new thread.) I think it will be handy, but I don’t know what problems it might cause.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
@ 2022-06-17  2:55         ` Stefan Monnier
  2022-06-17  5:28           ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Stefan Monnier @ 2022-06-17  2:55 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Ihor Radchenko, Emacs Devel

> I think you probably want a separate hook just for this purpose, rather than
> repurposing ts_record_change. We could have a lower-level
> after-change-functions that is immune to with-silent-modifications. Whether
> we should add such hook is probably another discussion. (So I opened a new
> thread.) I think it will be handy, but I don’t know that problem it
> might cause.

As I just argued in bug#51766, I don't think it makes sense to try to
have such "a lower-level after-change-functions that is immune to
with-silent-modifications".


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  2:00     ` Ihor Radchenko
  2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
@ 2022-06-17  5:23       ` Eli Zaretskii
  2022-06-17 10:40         ` Ihor Radchenko
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17  5:23 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: Emacs Devel <emacs-devel@gnu.org>
> Date: Fri, 17 Jun 2022 10:00:04 +0800
> 
> Would it be possible to expose ts_record_change to Elisp?
> 
> I am asking in the interest of Org mode parser that is also parsing the
> buffer AST and tracks buffer modifications.

Please tell more about the need.  I'm not happy with exposing this to
Lisp, and don't understand why the low-level parts of parsing the
buffer AST should be written in Lisp in the first place.  The
tree-sitter branch does this in C for that very reason.

We could rename ts_record_change to something more general, of course,
and make it available even if Emacs is not compiled with TS, if it can
be useful for other needs.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17  2:55         ` Stefan Monnier
@ 2022-06-17  5:28           ` Eli Zaretskii
  2022-06-17 10:10             ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17  5:28 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: casouri, yantar92, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Ihor Radchenko <yantar92@gmail.com>,  Emacs Devel <emacs-devel@gnu.org>
> Date: Thu, 16 Jun 2022 22:55:50 -0400
> 
> > I think you probably want a separate hook just for this purpose, rather than
> > repurposing ts_record_change. We could have a lower-level
> > after-change-functions that is immune to with-silent-modifications. Whether
> > we should add such hook is probably another discussion. (So I opened a new
> > thread.) I think it will be handy, but I don’t know that problem it
> > might cause.
> 
> As I just argued in bug#51766, I don't think it makes sense to try to
> have such "a lower-level after-change-functions that is immune to
> with-silent-modifications".

I tend to agree.  We can discuss the specific needs that triggered
that request, but by and large, we have a good reason to have
inhibit-modification-hooks that affects any Lisp program that wants to
know about buffer modifications.  That's the difference between the
Lisp level and the lower-level code in C, which "knows everything",
including when it isn't safe to use some data or some objects.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
                       ` (2 preceding siblings ...)
  2022-06-17  2:00     ` Ihor Radchenko
@ 2022-06-17  6:15     ` Eli Zaretskii
  2022-06-17  7:17       ` Yuan Fu
  2022-06-17 11:06     ` Jostein Kjønigsen
  4 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17  6:15 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Thu, 16 Jun 2022 12:03:08 -0700
> 
> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.

And a function to add to the list, right?  Or does it already exist?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
@ 2022-06-17  7:17       ` Yuan Fu
  2022-06-17 10:37         ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-17  7:17 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 16, 2022, at 11:15 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Thu, 16 Jun 2022 12:03:08 -0700
>> 
>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
> 
> And a function to add to the list, right?  Or does it already exist?

Creating a parser automatically adds it to the parser list of a buffer. The purpose of the parser list is to update each parser when the buffer content changes. So you don’t want to remove a parser from the list and add it back: it would be out-of-sync.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17  5:28           ` Eli Zaretskii
@ 2022-06-17 10:10             ` Ihor Radchenko
  2022-06-17 11:03               ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-17 10:10 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Stefan Monnier, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> As I just argued in bug#51766, I don't think it makes sense to try to
>> have such "a lower-level after-change-functions that is immune to
>> with-silent-modifications".
>
> I tend to agree.  We can discuss the specific needs that triggered
> that request, but by and large, we have a good reason to have
> inhibit-modification-hooks that affects any Lisp program that wants to
> know about buffer modifications.  That's the difference between the
> Lisp level and the lower-level code in C, which "knows everything",
> including when it isn't safe to use some data or some objects.

Now I am wondering why tree-sitter should be any different.
Apparently the existing after-change-functions functionality was not
sufficient for tree-sitter. Probably because of issues similar to
bug#51766. Can more fine-grained modification info be exposed to Elisp?

Best,
Ihor





^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  7:17       ` Yuan Fu
@ 2022-06-17 10:37         ` Eli Zaretskii
  2022-06-18  0:14           ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17 10:37 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Fri, 17 Jun 2022 00:17:54 -0700
> Cc: emacs-devel@gnu.org
> 
> >> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
> > 
> > And a function to add to the list, right?  Or does it already exist?
> 
> Creating a parser automatically adds it to the parser list of a buffer.

Then removing a parser means we actually delete it?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
@ 2022-06-17 10:40         ` Ihor Radchenko
  2022-06-17 11:42           ` Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter) Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-17 10:40 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> I am asking in the interest of Org mode parser that is also parsing the
>> buffer AST and tracks buffer modifications.
>
> Please tell more about the need.  I'm not happy with exposing this to
> Lisp, and don't understand why the low-level parts of parsing the
> buffer AST should be written in Lisp in the first place.  The
> tree-sitter branch does this in C for that very reason.

AFAIK, tree-sitter branch does not do anything related to _writing_
parsers. Parsers are implemented via tree-sitter modules.

Org mode parses Org markup elements in buffer into AST structure.
This AST structure is used to fontify Org buffers, modify various
elements, query element properties, build lists of matching elements
according to user queries (agenda), etc

The Org mode parser is implementing pretty much the same features
tree-sitter provides (except that the relevant Org code was in place
before tree-sitter became a thing): Only parts of Org buffer are parsed
as needed; buffer modifications trigger updates only within the affected
parts of the AST.

Thanks to the parser, Org is able to handle quite large buffers. Our
parser is written in Lisp, and yet it can parse a 15Mb Org file within
17sec vs. 8sec if parsed using the available incomplete tree-sitter Org
parser (https://github.com/milisims/tree-sitter-org).

Note that unlike tree-sitter, Org parser is able to change syntax using
Elisp. For example, adding new link element types is trivial with a
number of ol-*.el libraries provided by Org and third-party packages.

Moreover, the on-demand parsing keeps even 15Mb Org files responsive at
runtime with few issues. I was able to get bearable performance even
in a 100Mb Org file.

Org mode parser with all its flexibility would be difficult to implement
using tree-sitter.

As for implementing in C, I am not even sure how to approach this. Emacs
does provide external module, but AFAIU modules communicate with Emacs
process via print-ing/read-ing strings and the internal Emacs-C
functions are not available. I am not convinced that the speed
difference will be worth it to bother rewriting the whole parser in
Emacs-C.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17 10:10             ` Ihor Radchenko
@ 2022-06-17 11:03               ` Eli Zaretskii
  0 siblings, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17 11:03 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: monnier, casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: Stefan Monnier <monnier@iro.umontreal.ca>,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Fri, 17 Jun 2022 18:10:46 +0800
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> >> As I just argued in bug#51766, I don't think it makes sense to try to
> >> have such "a lower-level after-change-functions that is immune to
> >> with-silent-modifications".
> >
> > I tend to agree.  We can discuss the specific needs that triggered
> > that request, but by and large, we have a good reason to have
> > inhibit-modification-hooks that affects any Lisp program that wants to
> > know about buffer modifications.  That's the difference between the
> > Lisp level and the lower-level code in C, which "knows everything",
> > including when it isn't safe to use some data or some objects.
> 
> Now I am wondering why tree-sitter should be any different.
> Apparently the existing after-change-functions functionality was not
> sufficient for tree-sitter. Probably because of issues similar to
> bug#51766. Can more fine-grained modification info be exposed to Elisp?

tree-sitter isn't different: it does that in C, as part of the
low-level Emacs code which manipulates changes in buffer text.

My response above was about exposing that to Lisp, not about letting
features access buffer text in general.

after-change-functions is not the right tool for accessing buffer
text, they are a means to signal to Lisp that _some_ change happened
in buffer text which Lisp program _may_ wish to know about, and the
core reserves the right not to tell Lisp about some of the changes via
that hook.  Programs that _must_ know about each and every change in
buffer text cannot be written in Lisp, because there are changes that
I don't even know how to tell to a Lisp program in terms it will
understand.  For example, what if the buffer was changed from
multibyte to unibyte, or vice versa?  Or how to describe efficiently a
change in text properties?

Asking to have every aspect of the Emacs internals be exposed to Lisp
is NOT the right way of implementing features in Emacs!  Instead,
whenever the existing facilities are insufficient or don't allow you
to do the job, please describe the job you need to do, and let's
discuss how best to divide the implementation between the C primitives
(whether existing or new) and the Lisp application code.

Most of Emacs is written in Lisp to allow flexibility and safety, not
because we don't like C.  So the line that divides the C from the Lisp
parts of the implementation should consider which parts need to be
easily modified and which don't, and also which internals, if exposed
to Lisp, could easily lead to runaway applications wedging or crashing
Emacs.  These are non-trivial aspects, and the decision is not always
self-evident (though sometimes it is).



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
                       ` (3 preceding siblings ...)
  2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
@ 2022-06-17 11:06     ` Jostein Kjønigsen
  2022-06-18  0:28       ` Yuan Fu
  4 siblings, 1 reply; 187+ messages in thread
From: Jostein Kjønigsen @ 2022-06-17 11:06 UTC (permalink / raw)
  To: Yuan Fu, Emacs Devel

[-- Attachment #1: Type: text/plain, Size: 9435 bytes --]

On 16.06.2022 21:03, Yuan Fu wrote:
> Hey,
>
> I’ve just finished with Real Life and got back to tree-sitter. I’ll reply to individual messages separately, but here is a summary of all the latest changes pushed to feature/tree-sitter
>
> - Now one can compile a query, compiled query is much faster than uncompiled queries.
> - Traversal functions now have a parameter that controls how deep to traverse.
> - Removed the ltree-sitter setting in configure.ac
> - Consolidated all the parser creation functions into one: treesit-parser-create, that means treesit-get-parser and treesit-get-parser-create are removed.
>
> I think these are all the pending requests (sans highlight-paren), please let me know if I missed anything.
>
> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list. Because I’m not comfortable letting users remove and re-add parsers into the list anymore. Previously we determined that if a user wants to do the wrong thing, so be it. But now I realized that there could be danger in crashing Emacs if a user fiddles with treesit-parser-list incorrectly (and violates some of the assertions I put in).
>
> Can I just add a new Lisp_Object field in struct buffer? I assume that’s how you add an internal buffer-local data?
>
> Yuan

Nice update! Good work!

Trying latest source from emacs feature/tree-sitter branch though, and 
updating my code to use treesit-parser-create rather than 
treesit-get-parser-create... I have emacs segfaulting because of a 
double-free.

    jostein@ThinkPad-T14s:~/build/emacs$ emacs
    double free or corruption (out)
    Fatal error 6: Aborted

Running it through gdb gets me this result:

    jostein@ThinkPad-T14s:~/build/emacs$ gdb
    /home/jostein/build/emacs/src/emacs
    GNU gdb (Ubuntu 12.0.90-0ubuntu1) 12.0.90
    Copyright (C) 2022 Free Software Foundation, Inc.
    License GPLv3+: GNU GPL version 3 or later
    <http://gnu.org/licenses/gpl.html>
    This is free software: you are free to change and redistribute it.
    There is NO WARRANTY, to the extent permitted by law.
    Type "show copying" and "show warranty" for details.
    This GDB was configured as "x86_64-linux-gnu".
    Type "show configuration" for configuration details.
    For bug reporting instructions, please see:
    <https://www.gnu.org/software/gdb/bugs/>.
    Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

    For help, type "help".
    Type "apropos word" to search for commands related to "word"...
    Reading symbols from /home/jostein/build/emacs/src/emacs...
    (gdb) r
    Starting program: /home/jostein/build/emacs/src/emacs
    [Thread debugging using libthread_db enabled]
    Using host libthread_db library
    "/lib/x86_64-linux-gnu/libthread_db.so.1".
    [New Thread 0x7ffff11bc640 (LWP 54902)]
    [New Thread 0x7ffff086d640 (LWP 54903)]
    [New Thread 0x7fffebf3d640 (LWP 54904)]
    [New Thread 0x7fffeb5b4640 (LWP 54905)]
    [New Thread 0x7fffeadb3640 (LWP 54906)]
    [New Thread 0x7fffea50c640 (LWP 54907)]
    [Thread 0x7fffea50c640 (LWP 54907) exited]
    [New Thread 0x7fffea50c640 (LWP 54908)]
    [New Thread 0x7fffe9d0b640 (LWP 54909)]
    [Thread 0x7fffea50c640 (LWP 54908) exited]
    [Thread 0x7fffe9d0b640 (LWP 54909) exited]
    [New Thread 0x7fffe9d0b640 (LWP 54910)]
    [New Thread 0x7fffea50c640 (LWP 54911)]
    [Thread 0x7fffe9d0b640 (LWP 54910) exited]
    [Thread 0x7fffea50c640 (LWP 54911) exited]
    [Thread 0x7fffeadb3640 (LWP 54906) exited]
    [Detaching after vfork from child process 54913]
    [Detaching after vfork from child process 54914]
    [Detaching after vfork from child process 54922]
    [Detaching after vfork from child process 54924]
    [Detaching after vfork from child process 54929]
    [Detaching after vfork from child process 54930]
    [Detaching after vfork from child process 54964]
    [Detaching after vfork from child process 54965]
    [Detaching after vfork from child process 54994]
    [Detaching after vfork from child process 54995]
    [Detaching after vfork from child process 54996]
    [Detaching after vfork from child process 54997]
    [Detaching after vfork from child process 54998]
    [Detaching after vfork from child process 54999]
    [Detaching after vfork from child process 55001]
    [Detaching after vfork from child process 55003]
    [Detaching after vfork from child process 55004]
    [Thread 0x7fffeb5b4640 (LWP 54905) exited]
    [Detaching after vfork from child process 55044]
    [Detaching after vfork from child process 55045]
    [Detaching after vfork from child process 55046]
    [Detaching after vfork from child process 55047]
    [Detaching after vfork from child process 55048]

    Thread 1 "emacs" received signal SIGSEGV, Segmentation fault.
    0x00007ffff58de39f in ts_query_delete () from
    /usr/local/lib/libtree-sitter.so.0
    (gdb) bt
    #0  0x00007ffff58de39f in ts_query_delete () at
    /usr/local/lib/libtree-sitter.so.0
    #1  0x000055555573e849 in cleanup_vector (vector=<optimized out>) at
    alloc.c:3184
    #2  sweep_vectors () at alloc.c:3259
    #3  0x0000555555743a50 in gc_sweep () at alloc.c:7413
    #4  garbage_collect () at alloc.c:6259
    #5  0x0000555555743f11 in maybe_garbage_collect () at alloc.c:6108
    #6  0x0000555555765e85 in maybe_gc () at
    /home/jostein/build/emacs/src/lisp.h:5539
    #7  Ffuncall (nargs=nargs@entry=2, args=args@entry=0x7fffffff9310)
    at eval.c:2961
    #8  0x0000555555764751 in internal_condition_case_n
          (bfun=0x555555765cf0 <Ffuncall>, nargs=nargs@entry=2,
    args=args@entry=0x7fffffff9310, handlers=handlers@entry=0x30,
    hfun=hfun@entry=0x5555555ded20 <safe_eval_handler>) at eval.c:1565
    #9  0x00005555555c9e13 in safe__call
    (inhibit_quit=inhibit_quit@entry=false, nargs=nargs@entry=2,
    func=func@entry=0xb160, ap=ap@entry=0x7fffffff9390) at xdisp.c:3015
    #10 0x00005555555dd3b6 in safe_call (nargs=nargs@entry=2,
    func=func@entry=0xb160) at xdisp.c:3030
    #11 0x00005555555fed32 in safe_call1 (arg=0x55555643771d, fn=0xb160)
    at xdisp.c:3041
    #12 display_mode_lines (w=w@entry=0x555556437718) at xdisp.c:26098
    #13 0x0000555555614869 in redisplay_window (window=<optimized out>,
    just_this_one_p=<optimized out>) at xdisp.c:19894
    #14 0x0000555555618063 in redisplay_window_0
    (window=window@entry=0x55555643771d) at xdisp.c:17148
    #15 0x00005555557645fc in internal_condition_case_1
         (bfun=bfun@entry=0x555555618030 <redisplay_window_0>,
    arg=arg@entry=0x55555643771d, handlers=<optimized out>,
    hfun=hfun@entry=0x5555555c8ee0 <redisplay_window_error>) at eval.c:1509
    #16 0x00005555555caf49 in redisplay_windows (window=0x55555643771d)
    at xdisp.c:17128
    #17 0x00005555555ffe0d in redisplay_internal () at xdisp.c:16595
    #18 0x0000555555601414 in redisplay_preserve_echo_area
    (from_where=from_where@entry=9) at xdisp.c:16944
    #19 0x00005555557bdc4f in wait_reading_process_output
         (time_limit=time_limit@entry=0, nsecs=nsecs@entry=0,
    read_kbd=read_kbd@entry=-1, do_display=true,
    wait_for_cell=wait_for_cell@entry=0x0,
    wait_proc=wait_proc@entry=0x0, just_wait_proc=0) at process.c:5334
    #20 0x00005555556df7a7 in kbd_buffer_get_event (end_time=0x0,
    used_mouse_menu=0x7fffffffdb2b, kbp=<synthetic pointer>) at
    keyboard.c:3953
    #21 read_event_from_main_queue (end_time=<optimized out>,
    local_getcjmp=0x7fffffffd810, used_mouse_menu=0x7fffffffdb2b) at
    keyboard.c:2225
    #22 0x00005555556e55bb in read_decoded_event_from_main_queue
    (used_mouse_menu=<optimized out>, prev_event=<optimized out>,
    local_getcjmp=<optimized out>, end_time=<optimized out>) at
    keyboard.c:2288
    #23 read_char (commandflag=1, map=0x555558bfd8f3, prev_event=0x0,
    used_mouse_menu=0x7fffffffdb2b, end_time=0x0) at keyboard.c:2919
    #24 0x00005555556e7326 in read_key_sequence (keybuf=<optimized out>,
    prompt=0x0, dont_downcase_last=<optimized out>,
    can_return_switch_frame=true, fix_current_buffer=true,
    prevent_redisplay=<optimized out>)
         at keyboard.c:9965
    #25 0x00005555556e8fbc in command_loop_1 () at keyboard.c:1391
    #26 0x0000555555764567 in internal_condition_case
    (bfun=bfun@entry=0x5555556e8d70 <command_loop_1>,
    handlers=handlers@entry=0x90, hfun=hfun@entry=0x5555556dc5b0
    <cmd_error>) at eval.c:1485
    #27 0x00005555556d4c7e in command_loop_2
    (handlers=handlers@entry=0x90) at keyboard.c:1132
    #28 0x00005555557644a9 in internal_catch (tag=tag@entry=0xf6c0,
    func=func@entry=0x5555556d4c50 <command_loop_2>, arg=arg@entry=0x90)
    at eval.c:1208
    #29 0x00005555556d4c19 in command_loop () at keyboard.c:1110
    #30 0x00005555556dc108 in recursive_edit_1 () at keyboard.c:719
    #31 0x00005555556dc4b0 in Frecursive_edit () at keyboard.c:802
    #32 0x00005555555adf54 in main (argc=<optimized out>,
    argv=<optimized out>) at emacs.c:2518
    (gdb)


This is on "plain" Ubuntu 22.04, x86_64.

Emacs compiled using: ./configure --with-tree-sitter && make -j4

No other features or "experimental" things enabled.



-- 
Vennlig hilsen
*Jostein Kjønigsen*

jostein@kjonigsen.net 🍵 jostein@gmail.com
https://jostein.kjønigsen.no <https://jostein.kjønigsen.no>

[-- Attachment #2: Type: text/html, Size: 12337 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-17 10:40         ` Ihor Radchenko
@ 2022-06-17 11:42           ` Eli Zaretskii
  2022-06-18  5:52             ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17 11:42 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

[I've changed the Subject, since this is no longer about tree-sitter.]

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Fri, 17 Jun 2022 18:40:52 +0800
> 
> AFAIK, tree-sitter branch does not do anything related to _writing_
> parsers. Parsers are implemented via tree-sitter modules.

That is correct.  However, tree-sitter support is called for certain
changes in buffer text because tree-sitter needs direct and efficient
access to buffer text when those certain changes happen, and that
cannot be provided in Lisp.  There was a long discussion several
months ago where we came to this conclusion; the original design ideas
were different, and indeed at least some of them were based on
buffer-substring, which IMO is a terrible idea for this class of
features.

> Org mode parses Org markup elements in buffer into AST structure.
> This AST structure is used to fontify Org buffers, modify various
> elements, query element properties, build lists of matching elements
> according to user queries (agenda), etc
> 
> The Org mode parser is implementing pretty much the same features
> tree-sitter provides (except that the relevant Org code was in place
> before tree-sitter became a thing): Only parts of Org buffer are parsed
> as needed; buffer modifications trigger updates only within the affected
> parts of the AST.

OK, but that still doesn't tell what you need from the Emacs core.
Can you describe those needs?  I presume that modification hooks (of
any kind) are just the means; the real need is something else.  What
is it?  If (as I presume) you need to know about changes to the
buffer, then can you enumerate the changes that are of interest?  For
example, are changes in text properties and overlays of interest, and
if so, what kind of properties/overlays?  (But please don't limit your
answers to just text properties and overlays, because I asked about
them explicitly.)

Next, what kind of ASTs do you want to build, and how do you
represent text as AST?  In particular, is the AST defined by regexps
or some other Lisp data structures?

> As for implementing in C, I am not even sure how to approach this.

This is what needs to be discussed.  Emacs does have features
implemented partially in Lisp and partially in C, so this is not
impossible, far from that.  One example that comes to mind is
character composition -- a feature of the display engine that is
completely controlled by Lisp data structures that can be easily
changed at run time.  So, once we understand the needs and the
requirements, I'm quite sure ideas about the possible implementations
will not have us waiting for long.

> Emacs does provide external module, but AFAIU modules communicate
> with Emacs process via print-ing/read-ing strings and the internal
> Emacs-C functions are not available. I am not convinced that the
> speed difference will be worth it to bother rewriting the whole
> parser in Emacs-C.

I wasn't suggesting using modules.  Modules are intentionally limited
in their access to the Emacs internals.  For a core feature like the
one you are describing, using modules makes no sense at all.  No, I
was talking about providing new primitives and/or extending existing
primitives in order to support these features you want to provide in
Org (and also potentially to enable implementation of other similar
features by other packages).

As for speed, I suggest to delay the discussion of that until we have
a better understanding of the requirements and their various aspects,
and have some ideas regarding the possible implementations.  Even if
eventually there will be no gain in speed (and I find that hard to
believe), the safety of keeping some of the implementation un-exposed
to Lisp could well be worth our while.  Speed alone is not a
good-enough reason to implement something in C, especially if Lisp
performance is acceptable.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* RE: [External] : Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  1:11       ` Yuan Fu
@ 2022-06-17 14:22         ` Drew Adams
  0 siblings, 0 replies; 187+ messages in thread
From: Drew Adams @ 2022-06-17 14:22 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

> >> I’ve just finished with Real Life and got back to tree-sitter.
> >
> > The afterlife revealed, finally. ;-)
> 
> I’ll be very glad if my afterlife includes hacking Emacs.

You may need to autoload package AfterLife,
or perhaps add a suitable function to
`after-life-hook'.

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  1:24     ` Po Lu
@ 2022-06-18  0:09       ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-18  0:09 UTC (permalink / raw)
  To: Po Lu; +Cc: Emacs Devel



> On Jun 16, 2022, at 6:24 PM, Po Lu <luangruo@yahoo.com> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
> 
>> Can I just add a new Lisp_Object field in struct buffer? I assume
>> that’s how you add an internal buffer-local data?
> 
> Yes.  Make sure the field is placed before
> `cursor_in_non_selected_windows_', or it won't be traced by GC.
> 
> Also make sure to access it using the `BVAR' macro and add a
> corresponding `DEFVAR_PER_BUFFER' form in syms_of_buffer if it's
> supposed to be a buffer local variable.

I don’t plan to expose it as a variable, so I don’t need DEFVAR_PER_BUFFER, is that correct?

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17 10:37         ` Eli Zaretskii
@ 2022-06-18  0:14           ` Yuan Fu
  2022-06-18  6:22             ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-18  0:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 17, 2022, at 3:37 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Fri, 17 Jun 2022 00:17:54 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
>>> 
>>> And a function to add to the list, right?  Or does it already exist?
>> 
>> Creating a parser automatically adds it to the parser list of a buffer.
> 
> Then removing a parser means we actually delete it?

Not sure what you mean by “delete”. Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed. I probably should add checks that prohibit using a parser after it has been deleted.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17 11:06     ` Jostein Kjønigsen
@ 2022-06-18  0:28       ` Yuan Fu
  2022-06-18 20:57         ` Jostein Kjønigsen
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-18  0:28 UTC (permalink / raw)
  To: jostein; +Cc: Emacs Devel, Yoav Marco


> I tried to run the benchmarks again real quick, and ran into a segfault.
> It occurs in the call to ts_query_delete in cleanup_vector when
> garbage collecting.
> 
> I'll try to gather more info tomorrow, going to bed now.
> 
> Yoav


> 
> Nice update! Good work!
> 
> Trying latest source from emacs feature/tree-sitter branch though, and updating my code to use treesit-parser-create rather than treesit-get-parser-create... I have emacs segfaulting because of a double-free.


I’ve figured out the problem. It is due to my misunderstanding of how gc works. I’ve pushed a fix.

On a separate note, I also pushed the change that makes treesit-parser-list a function (rather than a variable).

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-17 11:42           ` Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter) Eli Zaretskii
@ 2022-06-18  5:52             ` Ihor Radchenko
  2022-06-18  7:01               ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-18  5:52 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> [I've changed the Subject, since this is no longer about tree-sitter.]

Well. I had some hope that we can generalize the tree-sitter interface
to allow Elisp-based parsers, but it is just a wish.

> OK, but that still doesn't tell what you need from the Emacs core.
> Can you describe those needs?  I presume that modification hooks (of
> any kind) are just the means; the real need is something else.  What
> is it?  If (as I presume) you need to know about changes to the
> buffer, then can you enumerate the changes that are of interest?  For
> example, are changes in text properties and overlays of interest, and
> if so, what kind of properties/overlays?  (But please don't limit your
> answers to just text properties and overlays, because I asked about
> them explicitly.)

Valid question. I am a bit too familiar with Org parser code and assume
that some things are "obvious" when they are not.

I will first answer about AST.

> Next, what kind of ASTs do you want to build, and how do you
> represent text as AST?  In particular, is the AST defined by regexps
> or some other Lisp data structures?

Org AST represents semantic objects using nested lists.
Similar to tree-sitter (AFAIU), each object in the tree is represented
by

(object-type (object-plist) object-children ...)

for example:

* test headline :tag:

is represented as

(headline
  (:raw-value "test headline" :begin 292 :end 314 ... :tags ("tag") ... :parent (...))
  ;; no children
   )

Upon modifying text inside the headline, we need to update :begin/:end
properties to reflect the new headline boundaries in buffer and possibly
update headline properties (e.g. :tags).

The same should be done for all the elements containing the headline.

Updating the elements requires the following information:

1. Whether modified text contained terminal symbols or text contributing
   to object-plist _before_ modification.
2. The boundaries of the edited text in buffer and change in the text
   length.
3. Whether the modified text contains terminal symbols/text contributing
   to object-plist _after_ modification.

Org does not care about text property changes or overlay changes.
We just perform a series of regexp searches over the changed parts of
buffer (possibly with extended boundaries) before and after the
modification + know which region of text has been modified (its begin,
end, and change in length).

Missing any significant change (the one involving terminal symbols or
changing region length) will make the AST invalid.

Hope it clarifies the needs.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:14           ` Yuan Fu
@ 2022-06-18  6:22             ` Eli Zaretskii
  2022-06-18  8:25               ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-18  6:22 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Fri, 17 Jun 2022 17:14:58 -0700
> Cc: emacs-devel@gnu.org
> 
> >>>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
> >>> 
> >>> And a function to add to the list, right?  Or does it already exist?
> >> 
> >> Creating a parser automatically adds it to the parser list of a buffer.
> > 
> > Then removing a parser means we actually delete it?
> 
> Not sure what you mean by “delete”.

If creating a parser adds it to the list, then I guessed the semantics
of removing from the list is the opposite: having the parser no longer
exist, i.e. "delete" it.

But now I'm confused by what you say here:

> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.

If we already have treesit-parser-delete, and that call removes the
parser from the list, then why would we need a function "to remove a
parser from the list"?  It sounds like treesit-parser-delete already
does it?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  5:52             ` Ihor Radchenko
@ 2022-06-18  7:01               ` Eli Zaretskii
  2022-06-18  7:23                 ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-18  7:01 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 18 Jun 2022 13:52:59 +0800
> 
> Org AST represents semantic objects using nested lists.
> Similar to tree-sitter (AFAIU), each object in the tree is represented
> by
> 
> (object-type (object-plist) object-children ...)
> 
> for example:
> 
> * test headline :tag:
> 
> is represented as
> 
> (headline
>   (:raw-value "test headline" :begin 292 :end 314 ... :tags ("tag") ... :parent (...))
>   ;; no children
>    )
> 
> Upon modifying text inside the headline, we need to update :begin/:end
> properties to reflect the new headline boundaries in buffer and possibly
> update headline properties (e.g. :tags).
> 
> The same should be done for all the elements containing the headline.

Where you care about changes in buffer positions of the AST elements,
markers should take care of most, if not all, of them.  I presume you
already do use markers wherever possible? if not, why not?  Or what am
I missing?

> Updating the elements requires the following information:
> 
> 1. Whether modified text contained terminal symbols or text contributing
>    to object-plist _before_ modification.
> 2. The boundaries of the edited text in buffer and change in the text
>    length.
> 3. Whether the modified text contains terminal symbols/text contributing
>    to object-plist _after_ modification.
> 
> Org does not care about text property changes or overlay changes.
> We just perform a series of regexp searches over the changed parts of
> buffer (possibly with extended boundaries) before and after the
> modification + know which region of text has been modified (its begin,
> end, and change in length).
> 
> Missing any significant change (the one involving terminal symbols or
> changing region length) will make the AST invalid.

Why would you miss significant changes if you base your implementation
on buffer-modification hooks?  If there are some situations where
buffer text is modified in ways that are significant for the update
of the AST, but buffer-modification hooks are NOT called, please
describe some of those situations, so we will have something concrete
to talk about.

IOW, I still don't see from the above description why markers and
buffer-modification hooks couldn't do the job, and you would need a
lower-level hook into buffer text change machinery.  I guess that
would require a more detailed description of the job at hand?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  7:01               ` Eli Zaretskii
@ 2022-06-18  7:23                 ` Ihor Radchenko
  2022-06-18  7:44                   ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-18  7:23 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> (headline
>>   (:raw-value "test headline" :begin 292 :end 314 ... :tags ("tag") ... :parent (...))
>>   ;; no children
>>    )
>> 
>> Upon modifying text inside the headline, we need to update :begin/:end
>> properties to reflect the new headline boundaries in buffer and possibly
>> update headline properties (e.g. :tags).
>> 
>> The same should be done for all the elements containing the headline.
>
> Where you care about changes in buffer positions of the AST elements,
> markers should take care of most, if not all, of them.  I presume you
> already do use markers wherever possible? if not, why not?  Or what am
> I missing?

Because lots of markers degrade Emacs regex search performance
tremendously.

See https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
and https://orgmode.org/list/87y21wkdwu.fsf@localhost

>> Missing any significant change (the one involving terminal symbols or
>> changing region length) will make the AST invalid.
>
> Why would you miss significant changes if you base your implementation
> on buffer-modification hooks?  If there are some situations where
> buffer text is modified in ways that are significant for the update
> of the AST, but buffer-modification hooks are NOT called, please
> describe some of those situations, so we will have something concrete
> to talk about.

The situation is third-party code doing bloody murder with

(with-silent-modifications
 (insert "Some text not triggering modification hooks"))

Another scenario is modifying text in indirect buffers created with
make-indirect-buffer. (where there is no chance to install
before/after-change-functions via clone-indirect-buffer-hook).

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  7:23                 ` Ihor Radchenko
@ 2022-06-18  7:44                   ` Eli Zaretskii
  2022-06-18  8:13                     ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-18  7:44 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 18 Jun 2022 15:23:51 +0800
> 
> > Where you care about changes in buffer positions of the AST elements,
> > markers should take care of most, if not all, of them.  I presume you
> > already do use markers wherever possible? if not, why not?  Or what am
> > I missing?
> 
> Because lots of markers degrade Emacs regex search performance
> tremendously.
> 
> See https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
> and https://orgmode.org/list/87y21wkdwu.fsf@localhost

AFAIU, the right fix for this is to fix performance degradation when a
buffer has many markers, not avoiding the use of markers.

Here's one conclusion from this discussion that indicates changes
required to be done in core (other than a low-level modification hook
for buffer text) to take care of your AST implementation.

We already have a TODO item for making markers more efficient; any
takers?

> > Why would you miss significant changes if you base your implementation
> > on buffer-modification hooks?  If there are some situations where
> > buffer text is modified in ways that are significant for the update
> > of the AST, but buffer-modification hooks are NOT called, please
> > describe some of those situations, so we will have something concrete
> > to talk about.
> 
> The situation is third-party code doing bloody murder with
> 
> (with-silent-modifications
>  (insert "Some text not triggering modification hooks"))
> 
> Another scenario is modifying text in indirect buffers created with
> make-indirect-buffer. (where there is no chance to install
> before/after-change-functions via clone-indirect-buffer-hook).

In at least the latter case the idea for a proper solution was
outlined by Stefan.

For other cases, I think a careful discussion on a case by case basis
will show the path towards solving each one of them.  It is possible
that some of them require further changes in core, but we won't know
until we discuss the details of each case.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  7:44                   ` Eli Zaretskii
@ 2022-06-18  8:13                     ` Ihor Radchenko
  2022-06-18  8:47                       ` Exposing buffer text modifications to Lisp Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-18  8:13 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> Because lots of markers degrade Emacs regex search performance
>> tremendously.
>> 
>> See https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
>> and https://orgmode.org/list/87y21wkdwu.fsf@localhost
>
> AFAIU, the right fix for this is to fix performance degradation when a
> buffer has many markers, not avoiding the use of markers.
>
> Here's one conclusion from this discussion that indicates changes
> required to be done in core (other than a low-level modification hook
> for buffer text) to take care of your AST implementation.
>
> We already have a TODO item for making markers more efficient; any
> takers?

This is trickier than it may appear.
Each element in Org AST has 3-7 markers.
My real-life large org buffer contains ~200k Org syntax elements
(actually more, but not all the elements are ever queried).
So, we are talking about 600k-1.4M markers in buffer if Org AST were to
use markers.

Now, imagine an edit somewhere near the beginning of Org buffer. Such
edit means that Emacs will have to shift positions of nearly all the
markers in the buffer. All the >1M markers. On every
self-insert-command.

Org parser goes around this issue by updating AST positions on idle and
maintaining asynchronous request queue. This works relatively well
because AST queries are skewed to be near the buffer region being
edited. I am not sure if similar approach (not trivial to start with)
can be efficiently utilized by Emacs. IDK the typical marker access
pattern in Emacs core.

Probably, Emacs may need to implement an alternative data structure to
store markers and allow efficient batch-shifting of the markers. Again,
not trivial.

>> The situation is third-party code doing bloody murder with
>> 
>> (with-silent-modifications
>>  (insert "Some text not triggering modification hooks"))
>> 
>> Another scenario is modifying text in indirect buffers created with
>> make-indirect-buffer. (where there is no chance to install
>> before/after-change-functions via clone-indirect-buffer-hook).
>
> In at least the latter case the idea for a proper solution was
> outlined by Stefan.

I haven't read through his email carefully yet. A quick response is that
I have seen a lot of code in the wild that simply uses
make-indirect-buffer. Expecting compliance is unreliable in practice. (I
may need to think more about this though)

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  6:22             ` Eli Zaretskii
@ 2022-06-18  8:25               ` Yuan Fu
  2022-06-18  8:50                 ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-18  8:25 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 17, 2022, at 11:22 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Fri, 17 Jun 2022 17:14:58 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>>>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
>>>>> 
>>>>> And a function to add to the list, right?  Or does it already exist?
>>>> 
>>>> Creating a parser automatically adds it to the parser list of a buffer.
>>> 
>>> Then removing a parser means we actually delete it?
>> 
>> Not sure what you mean by “delete”.
> 
> If creating a parser adds it to the list, then I guessed the semantics
> of removing from the list is the opposite: having the parser no longer
> exist, i.e. "delete" it.
> 
> But now I'm confused by what you say here:
> 
>> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.
> 
> If we already have treesit-parser-delete, and that call removes the
> parser from the list, then why would we need a function "to remove a
> parser from the list"?  It sounds like treesit-parser-delete already
> does it?

Yeah. There is no other function, treesit-parser-delete deletes and removes the parser. Though I don’t know how can you make a Lisp_Object “no longer exist”. Normally we just turn #<stuff> into #<deleted stuff>, right?

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-18  8:13                     ` Ihor Radchenko
@ 2022-06-18  8:47                       ` Eli Zaretskii
  2022-06-20 11:58                         ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-18  8:47 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 18 Jun 2022 16:13:13 +0800
> 
> > AFAIU, the right fix for this is to fix performance degradation when a
> > buffer has many markers, not avoiding the use of markers.
> >
> > Here's one conclusion from this discussion that indicates changes
> > required to be done in core (other than a low-level modification hook
> > for buffer text) to take care of your AST implementation.
> >
> > We already have a TODO item for making markers more efficient; any
> > takers?
> 
> This is trickier than it may appear.
> Each element in Org AST has 3-7 markers.
> My real-life large org buffer contains ~200k Org syntax elements
> (actually more, but not all the elements are ever queried).
> So, we are talking about 600k-1.4M markers in buffer if Org AST were to
> use markers.
> 
> Now, imagine an edit somewhere near the beginning of Org buffer. Such
> edit means that Emacs will have to shift positions of nearly all the
> markers in the buffer. All the >1M markers. On every
> self-insert-command.

The inner loop of adjust_markers_for_insert is just 40 machine
instructions.  (This is in unoptimized code; it could be fewer
instructions in an optimized build.)  Assuming a 3GHz CPU clock, 40
instructions should take just 13 nsec, and 1 million of these should
take 13 milliseconds -- a very short time indeed.  I expect that to be
between 5 and 7 msec in an optimized build.

(Compare that with inserting the characters themselves: the first
insertion could potentially mean moving the gap, which in a large
buffer means moving megabytes of bytes -- not a negligible feat.)

So I don't think the performance degradation due to markers is because
the insert/delete operations on buffer text need to update many
markers.  I think the real slowdown comes from the functions which
convert character positions to byte positions and vice versa: these
use markers.  There are a lot of such calls all over our code, and
that's where the current linear-linked-list implementation of markers
slows us down.

Of course, the right method to show the bottleneck(s) is to profile
the code with a tool like 'prof', and take it from there.  So here's
one more interesting job for someone to volunteer.

> Org parser goes around this issue by updating AST positions on idle and
> maintaining asynchronous request queue. This works relatively well
> because AST queries are skewed to be near the buffer region being
> edited. I am not sure if similar approach (not trivial to start with)
> can be efficiently utilized by Emacs. IDK the typical marker access
> pattern in Emacs core.

If you already have a workaround for marker-related problems, then why
do you need to hook into insertion and deletion on the lowest level?

> >> The situation is third-party code doing bloody murder with
> >> 
> >> (with-silent-modifications
> >>  (insert "Some text not triggering modification hooks"))
> >> 
> >> Another scenario is modifying text in indirect buffers created with
> >> make-indirect-buffer. (where there is no chance to install
> >> before/after-change-functions via clone-indirect-buffer-hook).
> >
> > In at least the latter case the idea for a proper solution was
> > outlined by Stefan.
> 
> I haven't read through his email carefully yet. A quick response is that
> I have seen a lot of code in the wild that simply uses
> make-indirect-buffer. Expecting compliance is unreliable in practice. (I
> may need to think more about this though)

If this is a frequent problem, perhaps we should introduce a special
variant of add-hook which would cater to the indirect-buffer case.
Discussion of several such cases could point to that conclusion, or it
can point to something else.

And that is my long-standing gripe aimed at developers of 3rd party
packages: they should come here (or bug-gnu-emacs@gnu.org) and present
the cases where they needed some missing infrastructure, instead of
trying to jump through hoops to work around what they perceive as
Emacs restrictions that (they think) cannot be possibly lifted.  Doing
the former will have at least two benefits: (a) it will facilitate
Emacs development into a better platform, and (b) it will avoid giving
birth to some of the horrible kludges out there, which eventually
don't work well enough, and thus make Emacs seem less professional
than it should be.

And if that is my expectation from developers of 3rd party packages, I
definitely expect that from packages that are bundled, such as Org.
Since Org is basically part of the core Emacs, it makes little sense
to me to realize that it goes to such lengths trying to work around
the limitations, instead of asking the core team to improve the
existing implementation or add some missing ones.  I could perhaps
understand if the request existed, but no one volunteered to work on
it, but not having the requests in the first place I cannot
understand.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  8:25               ` Yuan Fu
@ 2022-06-18  8:50                 ` Eli Zaretskii
  2022-06-18 20:07                   ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-18  8:50 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 18 Jun 2022 01:25:01 -0700
> Cc: emacs-devel@gnu.org
> 
> >>> Then removing a parser means we actually delete it?
> >> 
> >> Not sure what you mean by “delete”.
> > 
> > If creating a parser adds it to the list, then I guessed the semantics
> > of removing from the list is the opposite: having the parser no longer
> > exist, i.e. "delete" it.
> > 
> > But now I'm confused by what you say here:
> > 
> >> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.
> > 
> > If we already have treesit-parser-delete, and that call removes the
> > parser from the list, then why would we need a function "to remove a
> > parser from the list"?  It sounds like treesit-parser-delete already
> > does it?
> 
> Yeah. There is no other function, treesit-parser-delete deletes and removes the parser.

So you agree with me that a function to remove from the list is not
needed?  Once the list is no longer exposed to Lisp, the way Lisp
programs should manipulate the list is by adding and deleting parsers,
and by asking Emacs to show the list of existing parsers.  Right?

> Though I don’t know how can you make a Lisp_Object “no longer exist”. Normally we just turn #<stuff> into #<deleted stuff>, right?

Yes, we leave the actual deleting to GC.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  8:50                 ` Eli Zaretskii
@ 2022-06-18 20:07                   ` Yuan Fu
  2022-06-19  5:39                     ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-18 20:07 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 18, 2022, at 1:50 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Sat, 18 Jun 2022 01:25:01 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>>> Then removing a parser means we actually delete it?
>>>> 
>>>> Not sure what you mean by “delete”.
>>> 
>>> If creating a parser adds it to the list, then I guessed the semantics
>>> of removing from the list is the opposite: having the parser no longer
>>> exist, i.e. "delete" it.
>>> 
>>> But now I'm confused by what you say here:
>>> 
>>>> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.
>>> 
>>> If we already have treesit-parser-delete, and that call removes the
>>> parser from the list, then why would we need a function "to remove a
>>> parser from the list"?  It sounds like treesit-parser-delete already
>>> does it?
>> 
>> Yeah. There is no other function, treesit-parser-delete deletes and removes the parser.
> 
> So you agree with me that a function to remove from the list is not
> needed?  Once the list is no longer exposed to Lisp, the way Lisp
> programs should manipulate the list is by adding and deleting parsers,
> and by asking Emacs to show the list of existing parsers.  Right?

I don’t think we have any disagreement here, it’s just my miscommunication. We have three functions:
- treesit-parser-create that creates a parser and adds it to the parser list
- treesit-parser-delete that deletes a parser and removes it from the parser list
- treesit-parser-list that returns the parser list

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:28       ` Yuan Fu
@ 2022-06-18 20:57         ` Jostein Kjønigsen
  0 siblings, 0 replies; 187+ messages in thread
From: Jostein Kjønigsen @ 2022-06-18 20:57 UTC (permalink / raw)
  To: Yuan Fu, jostein; +Cc: Emacs Devel, Yoav Marco

[-- Attachment #1: Type: text/plain, Size: 759 bytes --]


On 18.06.2022 02:28, Yuan Fu wrote:
>
> I’ve figured out the problem. It is due to my misunderstanding of how gc works. I’ve pushed a fix.
>
> On a separate note, I also pushed the change that makes treesit-parser-list a function (rather than a variable).
>
> Yuan

Just FYI: Getting latest sources and compiling from scratch, I get the 
following build warning:

    In treesit-traverse-forward:
    treesit.el:291:2: Warning: docstring has wrong usage of unescaped
    single quotes (use \= or different quoting)

Besides that, I've tested your changes and they definitely fix the 
segfault. Great stuff!

-- 
Vennlig hilsen
*Jostein Kjønigsen*

jostein@kjonigsen.net 🍵 jostein@gmail.com
https://jostein.kjønigsen.no <https://jostein.kjønigsen.no>

[-- Attachment #2: Type: text/html, Size: 1592 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18 20:07                   ` Yuan Fu
@ 2022-06-19  5:39                     ` Eli Zaretskii
  2022-06-20  3:00                       ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-19  5:39 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 18 Jun 2022 13:07:07 -0700
> Cc: emacs-devel@gnu.org
> 
> > So you agree with me that a function to remove from the list is not
> > needed?  Once the list is no longer exposed to Lisp, the way Lisp
> > programs should manipulate the list is by adding and deleting parsers,
> > and by asking Emacs to show the list of existing parsers.  Right?
> 
> I don’t think we have any disagreement here, it’s just my miscommunication. We have three functions:
> - treesit-parser-create that creates a parser and adds it to the parser list
> - treesit-parser-delete that deletes a parser and removes it from the parser list
> - treesit-parser-list that returns the parser list

Right, that's exactly what I meant.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-19  5:39                     ` Eli Zaretskii
@ 2022-06-20  3:00                       ` Yuan Fu
  2022-06-20 11:44                         ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-20  3:00 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel


I added navigation functions like treesit-beginning/end-of-defun, and added search functions like treesit-search-beginning/end. Now I wonder where should I put them in the manual, do I put them under the treesit section (Parsing Program Source), or under the relevant existing sections in the manual? By relevant sections I mean putting treesit-beginning/end-of-defun in the same section as beginning/end-of-defun, etc.

Treesit-beginning/end-of-defun jump to the beginning/end of the current defun form; treesit-search-beginning/end search for a query and stop at the beginning/end of the node that matched the query.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20  3:00                       ` Yuan Fu
@ 2022-06-20 11:44                         ` Eli Zaretskii
  2022-06-20 20:01                           ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-20 11:44 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sun, 19 Jun 2022 20:00:49 -0700
> Cc: emacs-devel@gnu.org
> 
> 
> I added navigation functions like treesit-beginning/end-of-defun, and added search functions like treesit-search-beginning/end. Now I wonder where should I put them in the manual, do I put them under the treesit section (Parsing Program Source), or under the relevant existing sections in the manual?

The latter, please.

But why do we need a separate description for the tree-sitter
variants?  Shouldn't that be automatically supported by
beginning/end-of-defun, once some switch is thrown to enable
tree-sitter?

And if beginning/end-of-defun is for some reason too low-level/basic
for this role (but if you think so, please explain why), then I think
we need higher-level functions that by default are just thin wrappers
around beginning/end-of-defun, and will call tree-sitter versions when
Emacs is configured to do so.

I mean, it would be very cumbersome to request that each and every
major mode which wants to use tree-sitter will have to explicitly call
treesit-SOMETHING everywhere.

> Treesit-beginning/end-of-defun jump to the beginning/end of the current defun form; treesit-search-beginning/end search for a query and stop at the beginning/end of the node that matched the query.

So you are saying treesit-beginning/end-of-defun don't actually look
for beginning and end of a function, but for beginning and end of a
more abstract entity?  Then perhaps it would be wrong to have "defun"
in their names?  And in that case, maybe a separate section (under
"Motion") is better after all, since this is no longer "List Motion",
strictly speaking.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-18  8:47                       ` Exposing buffer text modifications to Lisp Eli Zaretskii
@ 2022-06-20 11:58                         ` Ihor Radchenko
  2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:33                           ` Alan Mackenzie
  0 siblings, 2 replies; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-20 11:58 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> > We already have a TODO item for making markers more efficient; any
>> > takers?
>> 
>> This is trickier than it may appear.
>> Each element in Org AST has 3-7 markers.
>> My real-life large org buffer contains ~200k Org syntax elements
>> (actually more, but not all the elements are ever queried).
>> So, we are talking about 600k-1.4M markers in buffer if Org AST were to
>> use markers.
>> 
>> Now, imagine an edit somewhere near the beginning of Org buffer. Such
>> edit means that Emacs will have to shift positions of nearly all the
>> markers in the buffer. All the >1M markers. On every
>> self-insert-command.
>
> The inner loop of adjust_markers_for_insert is just 40 machine
> instructions.  (This is in unoptimized code; it could be fewer
> instructions in an optimized build.)  Assuming a 3GHz CPU clock, 40
> instructions should take just 13 nsec, and 1 million of these should
> take 13 milliseconds -- a very short time indeed.  I expect that to be
> between 5 and 7 msec in an optimized build.
>
> (Compare that with inserting the characters themselves: the first
> insertion could potentially mean moving the gap, which in a large
> buffer means moving megabytes of bytes -- not a negligible feat.)

Noted.
Does Emacs C code provide any generic tree structure implementation?

> So I don't think the performance degradation due to markers is because
> the insert/delete operations on buffer text need to update many
> markers.  I think the real slowdown comes from the functions which
> convert character positions to byte positions and vice versa: these
> use markers.  There are a lot of such calls all over our code, and
> that's where the current linear-linked-list implementation of markers
> slows us down.
>
> Of course, the right method to show the bottleneck(s) is to profile
> the code with a tool like 'prof', and take it from there.  So here's
> one more interesting job for someone to volunteer.

That's what I did in https://orgmode.org/list/87y21wkdwu.fsf@localhost:

>>> The bottleneck appears to be buf_bytepos_to_charpos, called by
>>> BYTE_TO_CHAR macro, which, in turn, is used by set_search_regs

>>> buf_bytepos_to_charpos contains the following loop:
>>> 
>>>   for (tail = BUF_MARKERS (b); tail; tail = tail->next)
>>>     {
>>>       CONSIDER (tail->bytepos, tail->charpos);
>>> 
>>>       /* If we are down to a range of 50 chars,
>>> 	 don't bother checking any other markers;
>>> 	 scan the intervening chars directly now.  */
>>>       if (best_above - bytepos < distance
>>>           || bytepos - best_below < distance)
>>> 	break;
>>>       else
>>>         distance += BYTECHAR_DISTANCE_INCREMENT;
>>>     }
>>> 
>>> I am not sure if I understand the code correctly, but that loop is
>>> clearly scaling performance with the number of markers

>> Org parser goes around this issue by updating AST positions on idle and
>> maintaining asynchronous request queue. This works relatively well
>> because AST queries are skewed to be near the buffer region being
>> edited. I am not sure if similar approach (not trivial to start with)
>> can be efficiently utilized by Emacs. IDK the typical marker access
>> pattern in Emacs core.
>
> If you already have a workaround for marker-related problems, then why
> do you need to hook into insertion and deletion on the lowest level?

Because the workaround relies on before/after-change-functions that may
be suppressed by bad third-party code.

Also, markers will not solve all the needs of Org parser even when they
become more efficient. As I mentioned earlier, we also need to keep
track whether terminal symbols appear in the changed text before/after
modification. It boils down to matching regexps around changed region in
buffer before/after each modification. Suppressed
before/after-change-functions ruin this logic as well.

> And that is my long-standing gripe aimed at developers of 3rd party
> packages: they should come here (or bug-gnu-emacs@gnu.org) and present
> the cases where they needed some missing infrastructure, instead of
> trying to jump through hoops to work around what they perceive as
> Emacs restrictions that (they think) cannot be possibly lifted.  Doing
> the former will have at least two benefits: (a) it will facilitate
> Emacs development into a better platform, and (b) it will avoid giving
> birth to some of the horrible kludges out there, which eventually
> don't work well enough, and thus make Emacs seem less professional
> than it should be.
>
> And if that is my expectation from developers of 3rd party packages, I
> definitely expect that from packages that are bundled, such as Org.
> Since Org is basically part of the core Emacs, it makes little sense
> to me to realize that it goes to such lengths trying to work around
> the limitations, instead of asking the core team to improve the
> existing implementation or add some missing ones.  I could perhaps
> understand if the request existed, but no one volunteered to work on
> it, but not having the requests in the first place I cannot
> understand.

I think I need to clarify my position here.

The important thing you need to know about Org is that it does not only
support Emacs version Org is bundled with.
We currently support Emacs >=26. See
https://orgmode.org/worg/org-maintenance.html#emacs-compatibility

So, any major feature implemented in the development version of Emacs
cannot be easily used. The new feature will mean doubling the relevant
code on Org side: (1) supporting the new feature; (2) compatibility
layer to support older Emacs versions. Which means extra maintenance.

When I am also asked to implement the patch for this new feature for
Emacs, I get triple work.

Moreover, my previous attempt to propose a patch required for Org was
sunk in the depths of emacs-devel threads. (It was a patch for
isearch.el and it does not apply anymore onto master. I plan to
re-submit it when I get more time and interest. Just FYI)

Having said that, I do know that it is a better thing to reach out to Emacs
when a new feature is really beneficial. But I hope that my previous
explanation clarifies why there is a friction (at least, it is the case
for me personally) to contribute to Emacs. Emacs core-related items tend
to go down towards the end of todo lists.

Best,
Ihor




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 11:58                         ` Ihor Radchenko
@ 2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:14                             ` Stefan Kangas
                                               ` (2 more replies)
  2022-06-20 14:33                           ` Alan Mackenzie
  1 sibling, 3 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-20 12:32 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Mon, 20 Jun 2022 19:58:31 +0800
> 
> Does Emacs C code provide any generic tree structure implementation?

We have interval trees and red-black trees, but they are used for
specific C-level features, and I wouldn't call them "generic".

OTOH, don't you want to create a Lisp structure to represent AST?  If
so, C-level tree will not really help you, would it?

And I'm not sure it would be a good idea to have the trees in C even
if it would help: that's not the right subdivision between C and Lisp,
IME.  I'm quite sure your Lisp-level AST is fine for your purposes,
you just want to be able to update it more efficiently and not as
painfully as you do now.  Right?  If so, what we should do in C is to
enable that more efficient and less painful implementation, not to
provide the implementation itself.

> > Of course, the right method to show the bottleneck(s) is to profile
> > the code with a tool like 'prof', and take it from there.  So here's
> > one more interesting job for someone to volunteer.
> 
> That's what I did in https://orgmode.org/list/87y21wkdwu.fsf@localhost:
> 
> >>> The bottleneck appears to be buf_bytepos_to_charpos, called by
> >>> BYTE_TO_CHAR macro, which, in turn, is used by set_search_regs
> 
> >>> buf_bytepos_to_charpos contains the following loop:
> >>> 
> >>>   for (tail = BUF_MARKERS (b); tail; tail = tail->next)
> >>>     {
> >>>       CONSIDER (tail->bytepos, tail->charpos);
> >>> 
> >>>       /* If we are down to a range of 50 chars,
> >>> 	 don't bother checking any other markers;
> >>> 	 scan the intervening chars directly now.  */
> >>>       if (best_above - bytepos < distance
> >>>           || bytepos - best_below < distance)
> >>> 	break;
> >>>       else
> >>>         distance += BYTECHAR_DISTANCE_INCREMENT;
> >>>     }
> >>> 
> >>> I am not sure if I understand the code correctly, but that loop is
> >>> clearly scaling performance with the number of markers

Thanks, this seems to confirm my guess that these conversions are the
bottleneck.  In which case making marker access and search more
efficient will go a long way toward helping Org's AST parsing to
become more performant and eventually more easily maintainable.

> > If you already have a workaround for marker-related problems, then why
> > do you need to hook into insertion and deletion on the lowest level?
> 
> Because the workaround relies on before/after-change-functions that may
> be suppressed by bad third-party code.

Understood.

> Also, markers will not solve all the needs of Org parser even when they
> become more efficient. As I mentioned earlier, we also need to keep
> track whether terminal symbols appear in the changed text before/after
> modification. It boils down to matching regexps around changed region in
> buffer before/after each modification. Suppressed
> before/after-change-functions ruin this logic as well.

I asked a question about that, but you said you wanted to answer the
AST-related parts first.  So can we now go back to this aspect to
understand it better?  Emacs inhibits buffer-modification hooks when
it is quite sure Lisp programs "don't need to know" about those
modifications.  One example you cited where this bites you is use of
input methods.  But Quail doesn't inhibit the hooks completely, it
only inhibits them enough to pretend that just one character was
inserted, when the user might have inserted more.  So why does this
get in the way of the Org parser, if the modification hooks are being
called "enough"?

> > And that is my long-standing gripe aimed at developers of 3rd party
> > packages: they should come here (or bug-gnu-emacs@gnu.org) and present
> > the cases where they needed some missing infrastructure, instead of
> > trying to jump through hoops to work around what they perceive as
> > Emacs restrictions that (they think) cannot be possibly lifted.  Doing
> > the former will have at least two benefits: (a) it will facilitate
> > Emacs development into a better platform, and (b) it will avoid giving
> > birth to some of the horrible kludges out there, which eventually
> > don't work well enough, and thus make Emacs seem less professional
> > than it should be.
> >
> > And if that is my expectation from developers of 3rd party packages, I
> > definitely expect that from packages that are bundled, such as Org.
> > Since Org is basically part of the core Emacs, it makes little sense
> > to me to realize that it goes to such lengths trying to work around
> > the limitations, instead of asking the core team to improve the
> > existing implementation or add some missing ones.  I could perhaps
> > understand if the request existed, but no one volunteered to work on
> > it, but not having the requests in the first place I cannot
> > understand.
> 
> I think I need to clarify my position here.
> 
> The important thing you need to know about Org is that it does not only
> support Emacs version Org is bundled with.
> We currently support Emacs >=26. See
> https://orgmode.org/worg/org-maintenance.html#emacs-compatibility
> 
> So, any major feature implemented in the development version of Emacs
> cannot be easily used. The new feature will mean doubling the relevant
> code on Org side: (1) supporting the new feature; (2) compatibility
> layer to support older Emacs versions. Which means extra maintenance.

That's true, but the same is true for any new feature added to Emacs:
they can only be used since the first version which added them.  And
yet you still are asking us to provide such new features, like that
super-buffer-modification hook.

Thus, the difference between these two approaches is not whether or
not to add new features to core (which understandably makes the job of
developers of packages like Org harder due to support of older
Emacsen), the difference is _which_ new features to add.  I'm saying
that it is much better to add features which will avoid your jumping
through hoops, instead of adding features that will allow you to jump
through hoops faster and better, so to say.  It is better also in the
long run, because it helps Emacs development as well, and it helps you
and other 3rd party packages that will be able to use those new
features for future implementations.

> Moreover, my previous attempt to propose a patch required for Org was
> sunk in the depths of emacs-devel threads. (It was a patch for
> isearch.el and it does not apply anymore onto master. I plan to
> re-submit it when I get more time and interest. Just FYI)

This can unfortunately happen with any discussion, and is not always
under our control.  Perseverance is the only way I know of to prevail
in those cases.

> Emacs core-related items tend to go down towards the end of todo
> lists.

We don't have enough resources, it's true.  Hopefully, this won't
prevent people from raising such issues.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 12:32                           ` Eli Zaretskii
@ 2022-06-20 14:14                             ` Stefan Kangas
  2022-06-21  3:56                               ` Ihor Radchenko
  2022-06-21  4:36                             ` Ihor Radchenko
  2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
  2 siblings, 1 reply; 187+ messages in thread
From: Stefan Kangas @ 2022-06-20 14:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Ihor Radchenko, Yuan Fu, Emacs developers

Eli Zaretskii <eliz@gnu.org> writes:

> > Moreover, my previous attempt to propose a patch required for Org was
> > sunk in the depths of emacs-devel threads.
>
> This can unfortunately happen with any discussion, and is not always
> under our control.  Perseverance is the only way I know of to prevail
> in those cases.

I very much recommend sending patches to bug-gnu-emacs instead of
emacs-devel.  That decreases the risk of them getting forgotten or
lost in the noise by orders of magnitude.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 11:58                         ` Ihor Radchenko
  2022-06-20 12:32                           ` Eli Zaretskii
@ 2022-06-20 14:33                           ` Alan Mackenzie
  2022-06-21  3:58                             ` Ihor Radchenko
  1 sibling, 1 reply; 187+ messages in thread
From: Alan Mackenzie @ 2022-06-20 14:33 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

Hello, Ihor.

On Mon, Jun 20, 2022 at 19:58:31 +0800, Ihor Radchenko wrote:
> Eli Zaretskii <eliz@gnu.org> writes:

[ .... ]

> > If you already have a workaround for marker-related problems, then why
> > do you need to hook into insertion and deletion on the lowest level?

> Because the workaround relies on before/after-change-functions that may
> be suppressed by bad third-party code.

Why is that your (or our) problem?  Code which isn't the major mode
masking out the change functions is just invalid code.  Can't you just
document somewhere that before/after-change-functions are an essential
part of Org Mode, and that messing around with them will lead to
unpredictable results?

[ .... ]

> Best,
> Ihor

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20 11:44                         ` Eli Zaretskii
@ 2022-06-20 20:01                           ` Yuan Fu
  2022-06-21  2:26                             ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-20 20:01 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 20, 2022, at 4:44 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Sun, 19 Jun 2022 20:00:49 -0700
>> Cc: emacs-devel@gnu.org
>> 
>> 
>> I added navigation functions like treesit-beginning/end-of-defun, and added search functions like treesit-search-beginning/end. Now I wonder where should I put them in the manual, do I put them under the treesit section (Parsing Program Source), or under the relevant existing sections in the manual?
> 
> The latter, please.
> 
> But why do we need a separate description for the tree-sitter
> variants?  Shouldn't that be automatically supported by
> beginning/end-of-defun, once some switch is thrown to enable
> tree-sitter?
> 
> And if beginning/end-of-defun is for some reason too low-level/basic
> for this role (but if you think so, please explain why), then I think
> we need higher-level functions that by default are just thin wrappers
> around beginning/end-of-defun, and will call tree-sitter versions when
> Emacs is configured to do so.
> 
> I mean, it would be very cumbersome to request that each and every
> major mode which wants to use tree-sitter will have to explicitly call
> treesit-SOMETHING everywhere.

Major mode should set beginning-of-defun-function to treesit-beginning-of-defun, not unlike what they already do with major mode-specific beginning-of-defun functions. This way major mode has the freedom to decide which treesit features it wants to leverage.

> 
>> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.
> 
> So you are saying treesit-beginning/end-of-defun don't actually look
> for beginning and end of a function, but for beginning and end of a
> more abstract entity?  Then perhaps it would be wrong to have "defun"
> in their names?  And in that case, maybe a separate section (under
> "Motion") is better after all, since this is no longer "List Motion",
> strictly speaking.

The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?

Thanks,
Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20 20:01                           ` Yuan Fu
@ 2022-06-21  2:26                             ` Eli Zaretskii
  2022-06-21  4:39                               ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-21  2:26 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Mon, 20 Jun 2022 13:01:32 -0700
> Cc: emacs-devel@gnu.org
> 
> Major mode should set beginning-of-defun-function to treesit-beginning-of-defun, not unlike what they already do with major mode-specific beginning-of-defun functions. This way major mode has the freedom to decide which treesit features it wants to leverage.

Then those tree-sitter functions should be described together with
beginning/end-of-defun, I think.

> >> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.
> > 
> > So you are saying treesit-beginning/end-of-defun don't actually look
> > for beginning and end of a function, but for beginning and end of a
> > more abstract entity?  Then perhaps it would be wrong to have "defun"
> > in their names?  And in that case, maybe a separate section (under
> > "Motion") is better after all, since this is no longer "List Motion",
> > strictly speaking.
> 
> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?

Maybe so, but why did you mention treesit-search-beginning in this
context to begin with?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 14:14                             ` Stefan Kangas
@ 2022-06-21  3:56                               ` Ihor Radchenko
  0 siblings, 0 replies; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-21  3:56 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: Eli Zaretskii, Yuan Fu, Emacs developers

Stefan Kangas <stefan@marxist.se> writes:

> I very much recommend sending patches to bug-gnu-emacs instead of
> emacs-devel.  That decreases the risk of them getting forgotten or
> lost in the noise by orders of magnitude.

Thanks! I will keep this in mind.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 14:33                           ` Alan Mackenzie
@ 2022-06-21  3:58                             ` Ihor Radchenko
  0 siblings, 0 replies; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-21  3:58 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: Eli Zaretskii, casouri, emacs-devel

Alan Mackenzie <acm@muc.de> writes:

> On Mon, Jun 20, 2022 at 19:58:31 +0800, Ihor Radchenko wrote:
>> Eli Zaretskii <eliz@gnu.org> writes:
>
> [ .... ]
>
>> > If you already have a workaround for marker-related problems, then why
>> > do you need to hook into insertion and deletion on the lowest level?
>
>> Because the workaround relies on before/after-change-functions that may
>> be suppressed by bad third-party code.
>
> Why is that your (or our) problem?  Code which isn't the major mode
> masking out the change functions is just invalid code.  Can't you just
> document somewhere that before/after-change-functions are an essential
> part of Org Mode, and that messing around with them will lead to
> unpredictable results?

It is indeed possible. However, this particular issue can cause data
loss in user files. Which I'd prefer to avoid at all costs.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:14                             ` Stefan Kangas
@ 2022-06-21  4:36                             ` Ihor Radchenko
  2022-06-21 12:27                               ` Eli Zaretskii
  2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
  2 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-21  4:36 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> Does Emacs C code provide any generic tree structure implementation?
>
> We have interval trees and red-black trees, but they are used for
> specific C-level features, and I wouldn't call them "generic".
>
> OTOH, don't you want to create a Lisp structure to represent AST?  If
> so, C-level tree will not really help you, would it?

Clarification: I was asking about C-level trees to store marker list.
I did not have moving Org AST from Lisp to C-level in mind. We currently
use built-in Lisp implementation of AVL-tree to search across AST (which
is not ideal, but good enough for moderately large files).

>> Also, markers will not solve all the needs of Org parser even when they
>> become more efficient. As I mentioned earlier, we also need to keep
>> track whether terminal symbols appear in the changed text before/after
>> modification. It boils down to matching regexps around changed region in
>> buffer before/after each modification. Suppressed
>> before/after-change-functions ruin this logic as well.
>
> I asked a question about that, but you said you wanted to answer the
> AST-related parts first.  So can we now go back to this aspect to
> understand it better?

This is still somewhat related to AST. AST object properties that do not
refer to positions in buffer may still need to be updated upon buffer
modification.

For example, consider

* TODO headline

being changed into

* DONE headline

with (with-silent-modifications (search-forward "TODO") (replace-match "DONE"))
or even simply by (replace-match ...) inside indirect buffer created by
direct call to make-indirect-buffer.

The AST headline object will need to be updated from
(headline (... :todo-keyword "TODO" ...))
to
(headline (... :todo-keyword "DONE" ...))

> Emacs inhibits buffer-modification hooks when
> it is quite sure Lisp programs "don't need to know" about those
> modifications.  One example you cited where this bites you is use of
> input methods.  But Quail doesn't inhibit the hooks completely, it
> only inhibits them enough to pretend that just one character was
> inserted, when the user might have inserted more.  So why does this
> get in the way of the Org parser, if the modification hooks are being
> called "enough"?

It does not. Given that I implement the suggestion about using
buffer-size to track "missed" modifications, Quail will not be an issue
anymore.

The only potential problem that will remain is the type of buffer
modifications I described above (shielded by inhibit-modification-hooks
or by being done inside indirect buffer). If such modifications do not
change the buffer size (as above), we still get a problem that may
(although less likely) cause data loss on user side.

> Thus, the difference between these two approaches is not whether or
> not to add new features to core (which understandably makes the job of
> developers of packages like Org harder due to support of older
> Emacsen), the difference is _which_ new features to add.  I'm saying
> that it is much better to add features which will avoid your jumping
> through hoops, instead of adding features that will allow you to jump
> through hoops faster and better, so to say.  It is better also in the
> long run, because it helps Emacs development as well, and it helps you
> and other 3rd party packages that will be able to use those new
> features for future implementations.

I totally agree. Though an additional consideration is the LOC cost of
adding new features. As you can see, I took a lazy approach in this request.
Adding a new hook would not require much code change on Org side. In
contrast, changing implementation to markers will actually require
careful testing and a lot more LOC changes. So, we have a clash between
"faster" and "better" :)

In any case, I totally get your position and I do know that Emacs core
should not accept low-quality features just because they are going to be
easier for some single specific use-case. I would do the same if I were
to maintain Emacs.

> This can unfortunately happen with any discussion, and is not always
> under our control.  Perseverance is the only way I know of to prevail
> in those cases.

I understand. Unfortunately, it also creates mental friction on my side
despite this understanding. I will submit patches via debbugs in future
to make things more visible.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-21  2:26                             ` Eli Zaretskii
@ 2022-06-21  4:39                               ` Yuan Fu
  2022-06-21 10:18                                 ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-21  4:39 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 20, 2022, at 7:26 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Mon, 20 Jun 2022 13:01:32 -0700
>> Cc: emacs-devel@gnu.org
>> 
>> Major mode should set beginning-of-defun-function to treesit-beginning-of-defun, not unlike what they already do with major mode-specific beginning-of-defun functions. This way major mode has the freedom to decide which treesit features it wants to leverage.
> 
> Then those tree-sitter functions should be described together with
> beginning/end-of-defun, I think.

Cool.

> 
>>>> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.
>>> 
>>> So you are saying treesit-beginning/end-of-defun don't actually look
>>> for beginning and end of a function, but for beginning and end of a
>>> more abstract entity?  Then perhaps it would be wrong to have "defun"
>>> in their names?  And in that case, maybe a separate section (under
>>> "Motion") is better after all, since this is no longer "List Motion",
>>> strictly speaking.
>> 
>> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?
> 
> Maybe so, but why did you mention treesit-search-beginning in this
> context to begin with?

They are another set of functions whose manual entries I am not sure where to put. I’ll probably put them in 35.1 Searching for Strings, after search-forward.

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-21  4:39                               ` Yuan Fu
@ 2022-06-21 10:18                                 ` Eli Zaretskii
  2022-06-22  0:34                                   ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-21 10:18 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Mon, 20 Jun 2022 21:39:34 -0700
> Cc: emacs-devel@gnu.org
> 
> >> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?
> > 
> > Maybe so, but why did you mention treesit-search-beginning in this
> > context to begin with?
> 
> They are another set of functions that I wonder where to put manual entries in. I’ll probably put them in 35.1 Searching for Strings, after search-forward.

Maybe a new section under Searching and Matching?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-21  4:36                             ` Ihor Radchenko
@ 2022-06-21 12:27                               ` Eli Zaretskii
  2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-21 12:27 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Tue, 21 Jun 2022 12:36:14 +0800
> 
> > OTOH, don't you want to create a Lisp structure to represent AST?  If
> > so, C-level tree will not really help you, would it?
> 
> Clarification: I was asking about C-level trees to store marker list.
> I did not have moving Org AST from Lisp to C-level in mind. We currently
> use built-in Lisp implementation of AVL-tree to search across AST (which
> is not ideal, but good enough for moderately large files).

Ah, okay.  Sorry for my misunderstanding.

Trees could indeed be relevant, but maybe other data structures as
well?  E.g., why not hash tables?  Not that I consider myself an
expert on efficient search algorithms...

> For example, consider
> 
> * TODO headline
> 
> being changed into
> 
> * DONE headline
> 
> with (with-silent-modifications (search-forward "TODO") (replace-match "DONE"))
> or even simply by (replace-match ...) inside indirect buffer created by
> direct call to make-indirect-buffer.
> 
> The AST headline object will need to be updated from
> (headline (... :todo-keyword "TODO" ...))
> to
> (headline (... :todo-keyword "DONE" ...))
> 
> > Emacs inhibits buffer-modification hooks when
> > it is quite sure Lisp programs "don't need to know" about those
> > modifications.  One example you cited where this bites you is use of
> > input methods.  But Quail doesn't inhibit the hooks completely, it
> > only inhibits them enough to pretend that just one character was
> > inserted, when the user might have inserted more.  So why does this
> > get in the way of the Org parser, if the modification hooks are being
> > called "enough"?
> 
> It does not. Given that I implement the suggestion about using
> buffer-size to track "missed" modifications, Quail will not be an issue
> anymore.
> 
> The only potential problem that will remain is the type of buffer
> modifications I described above (shielded by inhibit-modification-hooks
> or by being done inside indirect buffer). If such modifications do not
> change the buffer size (as above), we still get a problem that may
> (although less likely) cause data loss on user side.

I'd consider such Lisp programs buggy.  Actually modifying a buffer
while concealing the entire modification is evil.

> I will submit patches via debbugs in future to make things more
> visible.

TIA.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-21 10:18                                 ` Eli Zaretskii
@ 2022-06-22  0:34                                   ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-22  0:34 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 21, 2022, at 3:18 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Mon, 20 Jun 2022 21:39:34 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?
>>> 
>>> Maybe so, but why did you mention treesit-search-beginning in this
>>> context to begin with?
>> 
>> They are another set of functions that I wonder where to put manual entries in. I’ll probably put them in 35.1 Searching for Strings, after search-forward.
> 
> Maybe a new section under Searching and Matching?

Sure.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:14                             ` Stefan Kangas
  2022-06-21  4:36                             ` Ihor Radchenko
@ 2022-06-22 15:45                             ` Basil L. Contovounesios
  2022-06-22 16:13                               ` Eli Zaretskii
  2 siblings, 1 reply; 187+ messages in thread
From: Basil L. Contovounesios @ 2022-06-22 15:45 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Ihor Radchenko, casouri, emacs-devel

Eli Zaretskii [2022-06-20 15:32 +0300] wrote:

>> From: Ihor Radchenko <yantar92@gmail.com>
>> Cc: casouri@gmail.com,  emacs-devel@gnu.org
>> Date: Mon, 20 Jun 2022 19:58:31 +0800
>> 
>> Does Emacs C code provide any generic tree structure implementation?
>
> We have interval trees and red-black trees, but they are used for
> specific C-level features, and I wouldn't call them "generic".

Would any of Gnulib's generic container types[0] be appropriate in this
case, and if so, fair game for using in Emacs' sources?

[0]: (info "(gnulib) Container data types")
https://gnu.org/s/gnulib/manual/html_node/Container-data-types.html

Thanks,

-- 
Basil



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
@ 2022-06-22 16:13                               ` Eli Zaretskii
  2022-06-25  4:54                                 ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-22 16:13 UTC (permalink / raw)
  To: Basil L. Contovounesios; +Cc: yantar92, casouri, emacs-devel

> From: "Basil L. Contovounesios" <contovob@tcd.ie>
> Cc: Ihor Radchenko <yantar92@gmail.com>,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Wed, 22 Jun 2022 18:45:53 +0300
> 
> Would any of Gnulib's generic container types[0] be appropriate in this
> case, and if so, fair game for using in Emacs' sources?
> 
> [0]: (info "(gnulib) Container data types")
> https://gnu.org/s/gnulib/manual/html_node/Container-data-types.html

Could be.  But I think we need more research before we decide.
Someone™ should study the usage patterns of the markers for
character-to-byte translation, and see which operations should be the
fastest and which could be slower.  Armed with that information, we
could then select the best data structure for the job.

Btw, do we have recipes for measuring the effects of changing the data
structures used for markers?  If we do have such recipes, did someone
try to compare the performance in plain-ASCII Org buffers (where the
conversion is trivial and shouldn't even access the markers) and
non-ASCII buffers?  Inserting a single non-ASCII character somewhere
in an otherwise plain-ASCII buffer should show the effect of many
markers on the likes of CHAR_TO_BYTE.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp)
  2022-06-21 12:27                               ` Eli Zaretskii
@ 2022-06-25  4:47                                 ` Ihor Radchenko
  2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
  2022-06-26 10:32                                   ` Robert Pluim
  0 siblings, 2 replies; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-25  4:47 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

I think that we settled on something workable regarding buffer
modifications. I will try the proposed solutions and see if issues pop
up on Org ML.

Changing subject to reflect the remaining point of the discussion better.

Eli Zaretskii <eliz@gnu.org> writes:

>> Clarification: I was asking about C-level trees to store marker list.
>> I did not have moving Org AST from Lisp to C-level in mind. We currently
>> use built-in Lisp implementation of AVL-tree to search across AST (which
>> is not ideal, but good enough for moderately large files).
>
> Ah, okay.  Sorry for my misunderstanding.
>
> Trees could indeed be relevant, but maybe other data structures as
> well?  E.g., why not hash tables?  Not that I consider myself an
> expert on efficient search algorithms...

AFAIU, buf_bytepos_to_charpos tries to search for the closest marker
near the requested bytepos. It currently does it using the following
heuristics (roughly):

(let ((threshold 50))
 (dolist (marker markers)
  (if (or (< (abs (- marker bytepos)) threshold)
          (< (abs (- nearest_previous_marker bytepos)) threshold))
     (throw 'found marker)
   (cl-incf threshold 50))))

If we store markers in a hash table, there will be no benefit: a hash
table only allows finding a marker at an exact position, not nearby.

AFAIK, the most natural data structure to search for data
before/after given key is a binary tree. There are more exotic data
structures, like skip list, but I do not expect skip lists to be
implemented in Emacs C code.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-22 16:13                               ` Eli Zaretskii
@ 2022-06-25  4:54                                 ` Ihor Radchenko
  2022-06-25  5:46                                   ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-25  4:54 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Basil L. Contovounesios, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> Btw, do we have recipes for measuring the effects of changing the data
> structures used for markers?  If we do have such recipes, did someone
> try to compare the performance in plain-ASCII Org buffers (where the
> conversion is trivial and shouldn't even access the markers) and
> non-ASCII buffers?  Inserting a single non-ASCII character somewhere
> in an otherwise plain-ASCII buffer should show the effect of many
> markers on the likes of CHAR_TO_BYTE.

AFAIK, buf_bytepos_to_charpos should take no time on plain-ASCII buffers
because

  /* If this buffer has as many characters as bytes,
     each character must be one byte.
     This takes care of the case where enable-multibyte-characters is nil.  */
  if (best_above == best_above_byte)
    return bytepos;

The recipe of measuring the effects is in
https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/

That email literally provides Elisp code to run in order to measure the
effect.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-25  4:54                                 ` Ihor Radchenko
@ 2022-06-25  5:46                                   ` Eli Zaretskii
  2022-06-29 12:24                                     ` Ihor Radchenko
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-25  5:46 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: contovob, casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: "Basil L. Contovounesios" <contovob@tcd.ie>,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Sat, 25 Jun 2022 12:54:36 +0800
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> > Btw, do we have recipes for measuring the effects of changing the data
> > structures used for markers?  If we do have such recipes, did someone
> > try to compare the performance in plain-ASCII Org buffers (where the
> > conversion is trivial and shouldn't even access the markers) and
> > non-ASCII buffers?  Inserting a single non-ASCII character somewhere
> > in an otherwise plain-ASCII buffer should show the effect of many
> > markers on the likes of CHAR_TO_BYTE.
> 
> AFAIK, buf_bytepos_to_charpos should take no time on plain-ASCII buffers

Yes, that's what I said above.

> The recipe of measuring the effects is in
> https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
> 
> That email literally provides Elisp code to run in order to measure the
> effect.

Thanks.  However, that Lisp includes Org code, and I wonder why that
is relevant and how it could affect the results.  Using regexp
search shouldn't need any Org code, and is quite simple to write, I
think.  The main problem is to find a recipe that puts many markers in
a buffer.  If this is what the Org code in that recipe is about, then
it's one situation where benchmarking ASCII vs non-ASCII buffers will
help.  Another use case is a buffer with a lot of overlays.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
@ 2022-06-25  8:29                                   ` Stefan Monnier
  2022-06-25  8:44                                     ` Eli Zaretskii
  2022-06-26 10:32                                   ` Robert Pluim
  1 sibling, 1 reply; 187+ messages in thread
From: Stefan Monnier @ 2022-06-25  8:29 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

> AFAIK, the most natural data structure to search for data
> before/after given key is a binary tree. There are more exotic data
> structures, like skip list, but I do not expect skip lists to be
> implemented in Emacs C code.

BTW, most markers are actually part of overlays.  And Andreas Politz
implemented an AA-tree based representation of overlays for Emacs (see
the branch `feature/noverlay`).

So if you have performance problems due to overlays, you might want to
check the branch, make sure it solves the problem, and see if you can
get it merged once and for all into `master`.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
@ 2022-06-25  8:44                                     ` Eli Zaretskii
  2022-06-25  9:07                                       ` Stefan Monnier
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-25  8:44 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: yantar92, casouri, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Eli Zaretskii <eliz@gnu.org>,  casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 25 Jun 2022 04:29:08 -0400
> 
> > AFAIK, the most natural data structure to search for data
> > before/after given key is a binary tree. There are more exotic data
> > structures, like skip list, but I do not expect skip lists to be
> > implemented in Emacs C code.
> 
> BTW, most markers are actually part of overlays.  And Andreas Politz
> implemented an AA-tree based representation of overlays for Emacs (see
> the branch `feature/noverlay`).
> 
> So if you have performance problems due to overlays, you might want to
> check the branch, make sure it solves the problem, and see if you can
> get it merged once and for all into `master`.

Landing that branch is indeed a very Good Thing, but AFAIU this
discussion revealed that Org adds quite a few markers of its own when
it parses the buffer, because it wants to track the positions of some
syntactic elements of the buffer.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  8:44                                     ` Eli Zaretskii
@ 2022-06-25  9:07                                       ` Stefan Monnier
  2022-06-25  9:20                                         ` Eli Zaretskii
  2022-06-25  9:47                                         ` Ihor Radchenko
  0 siblings, 2 replies; 187+ messages in thread
From: Stefan Monnier @ 2022-06-25  9:07 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: yantar92, casouri, emacs-devel

> Landing that branch is indeed a very Good Thing, but AFAIU this
> discussion revealed that Org adds quite a few markers of its own when
> it parses the buffer, because it wants to track the positions of some
> syntactic elements of the buffer.

That's quite unusual, tho, and I suspect that Org's code could be
changed to use overlays instead (after all, we could define an
"efficient marker" as (make-overlay POS POS)).


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:07                                       ` Stefan Monnier
@ 2022-06-25  9:20                                         ` Eli Zaretskii
  2022-06-25  9:27                                           ` Stefan Monnier
  2022-06-25  9:47                                         ` Ihor Radchenko
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-25  9:20 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: yantar92, casouri, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: yantar92@gmail.com,  casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 25 Jun 2022 05:07:59 -0400
> 
> > Landing that branch is indeed a very Good Thing, but AFAIU this
> > discussion revealed that Org adds quite a few markers of its own when
> > it parses the buffer, because it wants to track the positions of some
> > syntactic elements of the buffer.
> 
> That's quite unusual, tho, and I suspect that Org's code could be
> changed to use overlays instead (after all, we could define an
> "efficient marker" as (make-overlay POS POS)).

If that is going to be our advice, it would make sense to reimplement
markers as a kind of overlays, so that Lisp programs shouldn't bother
with this trick.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:20                                         ` Eli Zaretskii
@ 2022-06-25  9:27                                           ` Stefan Monnier
  0 siblings, 0 replies; 187+ messages in thread
From: Stefan Monnier @ 2022-06-25  9:27 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: yantar92, casouri, emacs-devel

> If that is going to be our advice,

That wouldn't be a general advice, but rather for specific cases where
performance proves problematic.  In any case, this is quite
hypothetical: we don't know yet whether that branch would help Org's
specific use-case.  And the code needs some work before we can merge it
(it's about 5 years old).

> it would make sense to reimplement markers as a kind of overlays, so
> that Lisp programs shouldn't bother with this trick.

That's a possibility, but this would require further changes for the
charpos<->bytepos conversion.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:07                                       ` Stefan Monnier
  2022-06-25  9:20                                         ` Eli Zaretskii
@ 2022-06-25  9:47                                         ` Ihor Radchenko
  2022-06-25  9:53                                           ` Stefan Monnier
  1 sibling, 1 reply; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-25  9:47 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Eli Zaretskii, casouri, emacs-devel

Stefan Monnier <monnier@iro.umontreal.ca> writes:

>> Landing that branch is indeed a very Good Thing, but AFAIU this
>> discussion revealed that Org adds quite a few markers of its own when
>> it parses the buffer, because it wants to track the positions of some
>> syntactic elements of the buffer.
>
> That's quite unusual, tho, and I suspect that Org's code could be
> changed to use overlays instead (after all, we could define an
> "efficient marker" as (make-overlay POS POS)).

Using overlays will put extra load on display engine, on top of the
performance related to pure markers.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:47                                         ` Ihor Radchenko
@ 2022-06-25  9:53                                           ` Stefan Monnier
  0 siblings, 0 replies; 187+ messages in thread
From: Stefan Monnier @ 2022-06-25  9:53 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

> Using overlays will put extra load on display engine, on top of the
> performance related to pure markers.

Could be, yes (but the tree storage of overlays should compensate to
some extent).  Hard to tell how significant that would be without trying
it out.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
  2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
@ 2022-06-26 10:32                                   ` Robert Pluim
  1 sibling, 0 replies; 187+ messages in thread
From: Robert Pluim @ 2022-06-26 10:32 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

>>>>> On Sat, 25 Jun 2022 12:47:38 +0800, Ihor Radchenko <yantar92@gmail.com> said:
    Ihor> AFAIU, buf_bytepos_to_charpos tries to search for the closest marker
    Ihor> near the requested bytepos. It currently does it using the following
    Ihor> heuristics (roughly):

    Ihor> (let ((threshold 50))
    Ihor>  (dolist (marker markers)
    Ihor>   (if (or (< (abs (- marker bytepos)) threshold)
    Ihor>           (< (abs (- nearest_previous_marker bytepos)) threshold))
    Ihor>      (throw 'found marker)
    Ihor>    (cl-incf threshold 50))))

    Ihor> If we store markers in a hash table, there will be no benefit - Hash
    Ihor> table will only allow to find marker at exact position, not nearby.

We could use (charpos / 50) as the key to the hash table, and then
store a list of markers as the value(s). Of course you'd have to
update the hash table every time a marker is added, deleted, or
changes charpos, so I donʼt know if it would be a win.

Robert
-- 



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-25  5:46                                   ` Eli Zaretskii
@ 2022-06-29 12:24                                     ` Ihor Radchenko
  0 siblings, 0 replies; 187+ messages in thread
From: Ihor Radchenko @ 2022-06-29 12:24 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: contovob, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> Thanks.  However, that Lisp includes Org code, and I wonder why is
> that relevant and how it could affect the results.  Using regexp
> search shouldn't need any Org code, and is quite simple to write, I
> think.  The main problem is to find a recipe that puts many markers in
> a buffer.  If this is what the Org code in that recipe is about, then
> it's one situation where benchmarking ASCII vs non-ASCII buffers will
> help.  Another use case is a buffer with a lot of overlays.

It is harder than just putting many markers.
I was unable to create a simpler reproducer.

Best,
Ihor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-30 14:29     ` Abin Simon
@ 2022-06-30 14:37       ` Yoav Marco
  0 siblings, 0 replies; 187+ messages in thread
From: Yoav Marco @ 2022-06-30 14:37 UTC (permalink / raw)
  To: Abin Simon; +Cc: Abin Simon, casouri, eliz, theo, monnier, dancol, emacs-devel


Abin Simon <mail@meain.io> writes:

>> I see you wrote a go program to pre-process #make-range!. I also had to do
>> something similar in
>> https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99
>> due to predicate incompatibility, though I do it in Elisp right after
>> reading the query file. So I guess it's a valid solution too.
>
> Ahh, that makes sense. I just went with go just because I was bit more
> confortable writing the "parser" in go.

Ah, well I cheated :) I ran a regexp on the query to replace # with \#,
and then the whole query is a valid elisp sexp, so I read it to a list
with `read' and recursively parsed *that*. Makes for very pretty code
IMO.

  Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-30 11:21   ` Yoav Marco
@ 2022-06-30 14:29     ` Abin Simon
  2022-06-30 14:37       ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Abin Simon @ 2022-06-30 14:29 UTC (permalink / raw)
  To: Yoav Marco, Abin Simon; +Cc: casouri, eliz, theo, monnier, dancol, emacs-devel

Yoav Marco <yoavm448@gmail.com> writes:

> I dug a little through the neovim code. #make-range! is implemented in
> nvim-treesitter, not mainline neovim. And they don't create a new node
> for the range, just compute the range itself.
>
> I think the best course of action is to support elisp predicates in some
> way, and let users post-process the output of treesit-query-capture if
> they added queries that return non-boolean results.

Yeah, I think this would probably work out pretty well, at least for my
usecase.

> I see you wrote a go program to pre-process #make-range!. I also had to do
> something similar in
> https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99
> due to predicate incompatibility, though I do it in Elisp right after
> reading the query file. So I guess it's a valid solution too.

Ahh, that makes sense. I just went with go just because I was a bit more
comfortable writing the "parser" in go.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-29 17:43 ` Yoav Marco
@ 2022-06-30 11:21   ` Yoav Marco
  2022-06-30 14:29     ` Abin Simon
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-06-30 11:21 UTC (permalink / raw)
  To: Abin Simon; +Cc: casouri, eliz, theo, monnier, dancol, emacs-devel

Yoav Marco <yoavm448@gmail.com> writes:

> Okay, so the problem here is that neovim supports making up arbitrary
> captures as a range over other captures: (#make-range! "c" @a @b) means
> "make a new capture @c that spans from @a to @b". It helps when you e.g
> want to make a capture spanning only two children of a node and not the
> other children.
>
> The problem is, making up new captures in Lisp isn't trivial, it would
> need special support from the C side.
>
> Could we support this?
>
> 1. We could allow a special case where if Lisp predicate return a list
>    (name beg end) that would have the same effect as "make new capture
>    @c with range @a, @b".
>
> 2. But captures are returned from treesit-query-capture as pairs of
>    (capture-name . node), and we can't just make up a node with
>    arbitrary range.
>
> 3. We could report non-boolean capture results by just appending that
>    result to the list of pairs, but that just adds complexity to users
>    of treesit-query-capture. And it doesn't support the simpler use case
>    of 1, where it's just reported as a normal capture.


I dug a little through the neovim code. #make-range! is implemented in
nvim-treesitter, not mainline neovim. And they don't create a new node
for the range, just compute the range itself.

I think the best course of action is to support elisp predicates in some
way, and let users post-process the output of treesit-query-capture if
they added queries that return non-boolean results.

> Abin Simon <abinsimon10@gmail.com> writes:
>
>> I ran into similar issues in the meain/evil-textobj-tree-sitter and had
>> to write scritps to convert them to something that works in emacs.

I see you wrote a go program to pre-process #make-range!. I also had to do
something similar in
https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99
due to predicate incompatibility, though I do it in Elisp right after
reading the query file. So I guess it's a valid solution too.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-29 16:51 Abin Simon
@ 2022-06-29 17:43 ` Yoav Marco
  2022-06-30 11:21   ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-06-29 17:43 UTC (permalink / raw)
  To: Abin Simon; +Cc: casouri, eliz, theo, monnier, dancol, emacs-devel

Cool that you're chipping in, we need feedback from the community and
specifically package writers.

Abin Simon <abinsimon10@gmail.com> writes:

> Yoav Marco <yoavm448@gmail.com> writes:
>
>> I could try to make a self-updating repo or something with CI -
>> https://github.com/meain/evil-textobj-tree-sitter for example pulls
>> highlights.scm changes from nvim's repo weekly.
>
> Just wanted to give a heads up that neovim has non standard items in
> their queries. For example you will find things like `vim-match` and
> `lua-match` in the queries.

I guess we could support a "lisp-match" predicate, that would run a
function and match if it returns non-nil. Or just try and look up
non-existing predicates as Lisp functions. Would that be useful?

> https://github.com/nvim-treesitter/nvim-treesitter/blob/989c75046c46d2ed96bb65c5badd6b8f785e7f09/queries/go/highlights.scm#L19
>
>
> I ran into similar issues in the meain/evil-textobj-tree-sitter and had
> to write scripts to convert them to something that works in emacs.
>
> ref: https://github.com/meain/evil-textobj-tree-sitter/issues/33

Okay, so the problem here is that neovim supports making up arbitrary
captures as a range over other captures: (#make-range! "c" @a @b) means
"make a new capture @c that spans from @a to @b". It helps when you e.g
want to make a capture spanning only two children of a node and not the
other children.

The problem is, making up new captures in Lisp isn't trivial, it would
need special support from the C side.

Could we support this?

1. We could allow a special case where if a Lisp predicate returns a list
   (name beg end) that would have the same effect as "make new capture
   @c with range @a, @b".

2. But captures are returned from treesit-query-capture as pairs of
   (capture-name . node), and we can't just make up a node with
   arbitrary range.

3. We could report non-boolean capture results by just appending that
   result to the list of pairs, but that just adds complexity to users
   of treesit-query-capture. And it doesn't support the simpler use case
   of 1, where it's just reported as a normal capture.

Yuan, any thoughts on capture extensibility?

  Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
@ 2022-06-29 16:51 Abin Simon
  2022-06-29 17:43 ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Abin Simon @ 2022-06-29 16:51 UTC (permalink / raw)
  To: yoavm448; +Cc: casouri, eliz, theo, monnier, dancol, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 727 bytes --]

Yoav Marco <yoavm448@gmail.com> writes:

> I could try to make a self-updating repo or something with CI -
> https://github.com/meain/evil-textobj-tree-sitter for example pulls
> highlights.scm changes from nvim's repo weekly.

Just wanted to give a heads up that neovim has non standard items in
their queries. For example you will find things like `vim-match` and
`lua-match` in the queries.

https://github.com/nvim-treesitter/nvim-treesitter/blob/989c75046c46d2ed96bb65c5badd6b8f785e7f09/queries/go/highlights.scm#L19

I ran into similar issues in the meain/evil-textobj-tree-sitter and had
to write scripts to convert them to something that works in emacs.

ref: https://github.com/meain/evil-textobj-tree-sitter/issues/33

[-- Attachment #2: Type: text/html, Size: 1140 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-28 19:35 ` Yoav Marco
@ 2022-06-29 15:35   ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-29 15:35 UTC (permalink / raw)
  To: Yoav Marco
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier,
	Daniel Colascione, emacs-devel



> On Jun 28, 2022, at 12:35 PM, Yoav Marco <yoavm448@gmail.com> wrote:
> 
> Whoops, my mail turned out garbaled, sorry.
> 
> Corrected version:
> 
> Yuan Fu <casouri@gmail.com> writes:
>>>> Also I think it makes more sense if you just fork it rather than making a PR.
>>> 
>>> Yeah, I don't really expect it to be merged.
>>> 
>>> My reason for the PR is for it to be more of a talking point about
>>> collaboration between treesit and tree-sitter-langs. I used
>>> elisp-tree-sitter before trying the feature/tree-sitter branch, and I
>>> really like the richness of its highlighting (which comes from the
>>> highlights.scm files).
>> 
>> Do you already have the highlighting working for treesit?
> 
> I only checked a few, but yeah, and the syntax highlighting is very pretty.

Cool.

> 
>> If so, maybe you can packages it in a separate package and publish it,
>> it would be a nice demonstration of treesit features.
> 
> I'm not comfortable with doing that, the query patterns update
> frequently upstream (though ubolonton has been busy lately) and all I'm
> doing is post-process them a little.
> 
> I could try to make a self-updating repo or something with CI -
> https://github.com/meain/evil-textobj-tree-sitter for example pulls
> highlights.scm changes from nvim's repo weeky.
> 
> Is that worth it? I thought people could just as easily use my PR
> branch of tree-sitter-langs. That also takes care of fetching grammars.

That would be a lot of work. Using your PR repo is fine, I think. I can just refer to your fork.

Thanks,
Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-28 16:08 Yoav Marco
@ 2022-06-28 19:35 ` Yoav Marco
  2022-06-29 15:35   ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-06-28 19:35 UTC (permalink / raw)
  To: Yuan Fu
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier,
	Daniel Colascione, emacs-devel

Whoops, my mail turned out garbled, sorry.

Corrected version:

Yuan Fu <casouri@gmail.com> writes:
>>> Also I think it makes more sense if you just fork it rather than making a PR.
>>
>> Yeah, I don't really expect it to be merged.
>>
>> My reason for the PR is for it to be more of a talking point about
>> collaboration between treesit and tree-sitter-langs. I used
>> elisp-tree-sitter before trying the feature/tree-sitter branch, and I
>> really like the richness of its highlighting (which comes from the
>> highlights.scm files).
>
> Do you already have the highlighting working for treesit?

I only checked a few, but yeah, and the syntax highlighting is very pretty.

> If so, maybe you can package it in a separate package and publish it,
> it would be a nice demonstration of treesit features.

I'm not comfortable with doing that, the query patterns update
frequently upstream (though ubolonton has been busy lately) and all I'm
doing is post-process them a little.

I could try to make a self-updating repo or something with CI -
https://github.com/meain/evil-textobj-tree-sitter for example pulls
highlights.scm changes from nvim's repo weekly.

Is that worth it? I thought people could just as easily use my PR
branch of tree-sitter-langs. That also takes care of fetching grammars.


  Sorry for the late response,
  Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
@ 2022-06-28 16:08 Yoav Marco
  2022-06-28 19:35 ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-06-28 16:08 UTC (permalink / raw)
  To: Yuan Fu
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier,
	Daniel Colascione, emacs-devel

References: <5bada349-2f43-4325-b696-70918584cd3d@email.android.com> <83mtfsuluo.fsf@gnu.org> <87sfpjhm33.fsf@thornhill.no> <83a6brufe5.fsf@gnu.org> <87pmkmhp8i.fsf@thornhill.no> <83v8ueuc7i.fsf@gnu.org> <73DE25BA-5EEF-4497-8F98-8C5F20853A61@gmail.com> <87v8uewfuq.fsf@thornhill.no> <B227CEAC-B3E9-45B2-B859-0C411CCECF3F@gmail.com> <87mtfkbt9n.fsf@thornhill.no> <1179E1EC-90EF-4989-BE1D-115498F77F60@gmail.com> <87k0ajygry.fsf@thornhill.no> <6EF70929-5759-4F1A-B878-0C1660FB6831@gmail.com> <87leuy5z46.fsf@thornhill.no> <2E68780C-9923-411E-A5DF-B1A54E2EC38B@gmail.com> <83k09fq0am.fsf@gnu.org> <38C272F6-828C-4478-9D90-326AF14D0C94@gmail.com> <87h74j2lin.fsf@gmail.com> <4C257709-0E3E-46A7-8CEB-569001EEE31D@gmail.com> <87edzm2vk8.fsf@gmail.com> <1CEC92F7-3CB9-48DA-887C-43CD4C58E406@gmail.com>
User-agent: mu4e 1.6.3; emacs 29.0.50
In-reply-to: <1CEC92F7-3CB9-48DA-887C-43CD4C58E406@gmail.com>

https://github.com/meain/evil-textobj-tree-sitter for example pulls
Yuan Fu <casouri@gmail.com> writes:
>>> Also I think it makes more sense if you just fork it rather than making a PR.
>>
>> Yeah, I don't really expect it to be merged.
>>
>> My reason for the PR is for it to be more of a talking point about
>> collaboration between treesit and tree-sitter-langs. I used
>> elisp-tree-sitter before trying the feature/tree-sitter branch, and I
>> really like the richness of its highlighting (which comes from the
>> highlights.scm files).
>
> Do you already have the highlighting working for treesit?

I only checked a few, but yeah, and the syntax highlighting is very pretty.

> If so, maybe you can packages it in a separate package and publish it,
> it would be a nice demonstration of treesit features.

I'm not comfortable with doing that, the query patterns update
frequently upstream (though ubolonton has been busy lately) and all I'm
doing is post-process them a little.

I could try to make a self-updating repo or something with CI -
highlights.scm changes from nvim's repo weeky.

Is that worth it? I thought people could just as easily use my PR
branch of tree-sitter-langs. That also takes care of fetching grammars.


  Sorry for the late response,
  Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20 14:20                                                                       ` Daniel Martín
@ 2022-06-20 20:03                                                                         ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-20 20:03 UTC (permalink / raw)
  To: Daniel Martín
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier, Emacs Devel, dancol



> On Jun 20, 2022, at 7:20 AM, Daniel Martín <mardani29@yahoo.es> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
> 
>>> 
>>>> [1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html
>>> 
>>> This seems to indicate we should provide more higher-level features to
>>> make use of tree-sitter related features easier. The instruction
>>> there sound like a typical user will need to do a lot before he/she
>>> can see any practical benefit from this build.
>>> 
>>> Maybe updating it will solve some of that.
>> 
>> I think tree-sitter needs other people to use it to write interesting
>> things to be useful for end users. Before someone adapt some major
>> modes with tree-sitter and push to feature/tree-sitter, this build
>> would probably remain uninteresting to end users.
>> 
> 
> I suggest "advertising" the branch in other Emacs communities like
> Reddit, etc. I know there's people that have already created major
> modes that use the currently available Tree-Sitter integration as an
> Emacs module. They can provide good feedback about what's expected from
> the API, or people might "port" the modes to use the Core Tree-Sitter
> API.

Good idea! I had done it last September where I advertised that article I linked on reddit. I’m planning to update that article and advertise again soon.

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:20                                                                     ` Yuan Fu
  2022-06-18  6:23                                                                       ` Eli Zaretskii
@ 2022-06-20 14:20                                                                       ` Daniel Martín
  2022-06-20 20:03                                                                         ` Yuan Fu
  1 sibling, 1 reply; 187+ messages in thread
From: Daniel Martín @ 2022-06-20 14:20 UTC (permalink / raw)
  To: Yuan Fu
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier, Emacs Devel, dancol

Yuan Fu <casouri@gmail.com> writes:

>> 
>>> [1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html
>> 
>> This seems to indicate we should provide more higher-level features to
>> make use of tree-sitter related features easier. The instruction
>> there sound like a typical user will need to do a lot before he/she
>> can see any practical benefit from this build.
>> 
>> Maybe updating it will solve some of that.
>
> I think tree-sitter needs other people to use it to write interesting
> things to be useful for end users. Before someone adapt some major
> modes with tree-sitter and push to feature/tree-sitter, this build
> would probably remain uninteresting to end users.
>

I suggest "advertising" the branch in other Emacs communities like
Reddit, etc.  I know there's people that have already created major
modes that use the currently available Tree-Sitter integration as an
Emacs module.  They can provide good feedback about what's expected from
the API, or people might "port" the modes to use the Core Tree-Sitter
API.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  8:15                                                                       ` Yoav Marco
@ 2022-06-18 20:11                                                                         ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-18 20:11 UTC (permalink / raw)
  To: Yoav Marco
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier,
	Daniel Colascione, emacs-devel



> On Jun 18, 2022, at 1:15 AM, Yoav Marco <yoavm448@gmail.com> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
>>> That reminds me -- Yuan, have you seen my PR to tree-sitter-langs, Hacky
>>> support for treesit in Emacs core [2]?
>>> 
>>> It uses tree-sitter-langs' groundwork for fetching grammars and
>>> packaging highlighting queries, but uses treesit instead of the
>>> tree-sitter dynamic module.
>>> 
>>> Enabling highlighting is just M-x treesit-langs-hl-mode in major-modes
>>> that tree-sitter-langs supports.
>>> 
>>> [2]: https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99
>> 
>> Ah yes, I’ve seen it. I think the part that automatically downloads and builds
>> language definitions is very useful. It cannot be in Emacs core because we
>> cannot distribute language definitions, but it could be a very useful ELPA or
>> MELPA package.
> 
> Yeah, I agree. tree-sitter-langs makes it very easy to use
> elisp-tree-sitter and now also the newer treesit.
> 
>> The part that automatically generate highlighting is also useful,
>> but I’m not sure how would we use it.
> 
> A little neatpick -- it doesn't really 'generate', just converts
> tree-sitter-langs' hand-crafted highlights.scm query files to be usable
> by treesit too.
> 
>> We probably don’t want to add a tree-sitterify-mode that just enables
>> tree-sitter highlight in a mode—I prefer that we change each major
>> mode to use tree-sitter features.
> 
> I agree, though since grammars aren't packaged with Emacs, major-modes
> that *are* packaged with Emacs would need to only use treesit when the
> grammars are avaliable. Or do we expect grammars to be a dependency when
> users are installing Emacs?

No, we don’t distribute language definitions, built-in major-modes should support both tree-sitter and non-tree-sitter.

> 
>> Also I think it makes more sense if you just fork it rather than making a PR.
> 
> Yeah, I don't really expect it to be merged.
> 
> My reason for the PR is for it to be more of a talking point about
> collaboration between treesit and tree-sitter-langs. I used
> elisp-tree-sitter before trying the feature/tree-sitter branch, and I
> really like the richness of its highlighting (which comes from the
> highlights.scm files).

Do you already have the highlighting working for treesit? If so, maybe you can package it in a separate package and publish it, it would be a nice demonstration of treesit features.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:35                                                                     ` Yuan Fu
@ 2022-06-18  8:15                                                                       ` Yoav Marco
  2022-06-18 20:11                                                                         ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-06-18  8:15 UTC (permalink / raw)
  To: Yuan Fu
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier,
	Daniel Colascione, emacs-devel

Yuan Fu <casouri@gmail.com> writes:
>> That reminds me -- Yuan, have you seen my PR to tree-sitter-langs, Hacky
>> support for treesit in Emacs core [2]?
>>
>> It uses tree-sitter-langs' groundwork for fetching grammars and
>> packaging highlighting queries, but uses treesit instead of the
>> tree-sitter dynamic module.
>>
>> Enabling highlighting is just M-x treesit-langs-hl-mode in major-modes
>> that tree-sitter-langs supports.
>>
>> [2]: https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99
>
> Ah yes, I’ve seen it. I think the part that automatically downloads and builds
> language definitions is very useful. It cannot be in Emacs core because we
> cannot distribute language definitions, but it could be a very useful ELPA or
> MELPA package.

Yeah, I agree. tree-sitter-langs makes it very easy to use
elisp-tree-sitter and now also the newer treesit.

> The part that automatically generate highlighting is also useful,
> but I’m not sure how would we use it.

A little nitpick -- it doesn't really 'generate', just converts
tree-sitter-langs' hand-crafted highlights.scm query files to be usable
by treesit too.

> We probably don’t want to add a tree-sitterify-mode that just enables
> tree-sitter highlight in a mode—I prefer that we change each major
> mode to use tree-sitter features.

I agree, though since grammars aren't packaged with Emacs, major-modes
that *are* packaged with Emacs would need to only use treesit when the
grammars are available. Or do we expect grammars to be a dependency when
users are installing Emacs?

> Also I think it makes more sense if you just fork it rather than making a PR.

Yeah, I don't really expect it to be merged.

My reason for the PR is for it to be more of a talking point about
collaboration between treesit and tree-sitter-langs. I used
elisp-tree-sitter before trying the feature/tree-sitter branch, and I
really like the richness of its highlighting (which comes from the
highlights.scm files).

  Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:20                                                                     ` Yuan Fu
@ 2022-06-18  6:23                                                                       ` Eli Zaretskii
  2022-06-20 14:20                                                                       ` Daniel Martín
  1 sibling, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-18  6:23 UTC (permalink / raw)
  To: Yuan Fu; +Cc: theo, monnier, emacs-devel, dancol

> From: Yuan Fu <casouri@gmail.com>
> Date: Fri, 17 Jun 2022 17:20:57 -0700
> Cc: Theodor Thornhill <theo@thornhill.no>,
>  Stefan Monnier <monnier@iro.umontreal.ca>,
>  Emacs Devel <emacs-devel@gnu.org>,
>  dancol@dancol.org
> 
> > 
> >> [1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html
> > 
> > This seems to indicate we should provide more higher-level features to
> > make use of tree-sitter related features easier. The instruction
> > there sound like a typical user will need to do a lot before he/she
> > can see any practical benefit from this build.
> > 
> > Maybe updating it will solve some of that.
> 
> I think tree-sitter needs other people to use it to write interesting things to be useful for end users. Before someone adapt some major modes with tree-sitter and push to feature/tree-sitter, this build would probably remain uninteresting to end users. 

That was the essence of what I wrote, yes.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17 18:12                                                                   ` Yoav Marco
@ 2022-06-18  0:35                                                                     ` Yuan Fu
  2022-06-18  8:15                                                                       ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-18  0:35 UTC (permalink / raw)
  To: Yoav Marco
  Cc: Eli Zaretskii, Theodor Thornhill, Stefan Monnier,
	Daniel Colascione, emacs-devel

> 
> That reminds me -- Yuan, have you seen my PR to tree-sitter-langs, Hacky
> support for treesit in Emacs core [2]?
> 
> It uses tree-sitter-langs' groundwork for fetching grammars and
> packaging highlighting queries, but uses treesit instead of the
> tree-sitter dynamic module.
> 
> Enabling highlighting is just M-x treesit-langs-hl-mode in major-modes
> that tree-sitter-langs supports.
> 
> [2]: https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99

Ah yes, I’ve seen it. I think the part that automatically downloads and builds language definitions is very useful. It cannot be in Emacs core because we cannot distribute language definitions, but it could be a very useful ELPA or MELPA package. The part that automatically generates highlighting is also useful, but I’m not sure how we would use it. We probably don’t want to add a tree-sitterify-mode that just enables tree-sitter highlight in a mode—I prefer that we change each major mode to use tree-sitter features.

Also I think it makes more sense if you just fork it rather than making a PR.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17 10:42                                                                   ` Eli Zaretskii
@ 2022-06-18  0:20                                                                     ` Yuan Fu
  2022-06-18  6:23                                                                       ` Eli Zaretskii
  2022-06-20 14:20                                                                       ` Daniel Martín
  0 siblings, 2 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-18  0:20 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Theodor Thornhill, Stefan Monnier, Emacs Devel, dancol

> 
>> [1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html
> 
> This seems to indicate we should provide more higher-level features to
> make use of tree-sitter related features easier. The instruction
> there sound like a typical user will need to do a lot before he/she
> can see any practical benefit from this build.
> 
> Maybe updating it will solve some of that.

I think tree-sitter needs other people to use it to write interesting things to be useful for end users. Before someone adapts some major modes with tree-sitter and pushes to feature/tree-sitter, this build would probably remain uninteresting to end users. 

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  7:32                                                                 ` Yuan Fu
  2022-06-17 10:42                                                                   ` Eli Zaretskii
@ 2022-06-17 18:12                                                                   ` Yoav Marco
  2022-06-18  0:35                                                                     ` Yuan Fu
  1 sibling, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-06-17 18:12 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Eli Zaretskii, Theodor Thornhill, monnier, dancol, emacs-devel


Yuan Fu <casouri@gmail.com> writes:

>> On Jun 16, 2022, at 11:19 PM, Eli Zaretskii <eliz@gnu.org> wrote:
>> [...]
>> I mean, what should a user do, after
>> building the branch, to start using the features backed by
>> tree-sitter?  I presume the list should include downloading parsers
>> (any recommendations for the relevant sites?), compiling them (or
>> maybe there are sites with precompiled ones?), then setting some
>> variables or data structures in Emacs or invoking some commands/modes?
>>
>> I think something like that will be useful for more people to try the
>> branch, provide feedback, and facilitate making its more stable.
>
> I do have an article that covers many of the topics you mentioned [1]. But it
> lacks “hands-on” details, and it is a bit out-dated (repository moved, we
> changed the prefix, etc). I can update it with, say, a starter guide on
> implementing a minimal tree-sitter C major mode. I’ll post back once its done.
>
> [1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html
>
> Yuan

That reminds me -- Yuan, have you seen my PR to tree-sitter-langs, Hacky
support for treesit in Emacs core [2]?

It uses tree-sitter-langs' groundwork for fetching grammars and
packaging highlighting queries, but uses treesit instead of the
tree-sitter dynamic module.

Enabling highlighting is just M-x treesit-langs-hl-mode in major-modes
that tree-sitter-langs supports.

[2]: https://github.com/emacs-tree-sitter/tree-sitter-langs/pull/99



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  7:32                                                                 ` Yuan Fu
@ 2022-06-17 10:42                                                                   ` Eli Zaretskii
  2022-06-18  0:20                                                                     ` Yuan Fu
  2022-06-17 18:12                                                                   ` Yoav Marco
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17 10:42 UTC (permalink / raw)
  To: Yuan Fu; +Cc: theo, monnier, emacs-devel, dancol

> From: Yuan Fu <casouri@gmail.com>
> Date: Fri, 17 Jun 2022 00:32:34 -0700
> Cc: Theodor Thornhill <theo@thornhill.no>,
>  monnier@iro.umontreal.ca,
>  emacs-devel@gnu.org,
>  dancol@dancol.org
> 
> 
> 
> > On Jun 16, 2022, at 11:19 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> > 
> >> From: Yuan Fu <casouri@gmail.com>
> >> Date: Thu, 16 Jun 2022 12:09:10 -0700
> >> Cc: Eli Zaretskii <eliz@gnu.org>,
> >> Stefan Monnier <monnier@iro.umontreal.ca>,
> >> Emacs Devel <emacs-devel@gnu.org>,
> >> Daniel Colascione <dancol@dancol.org>
> >> 
> >> I also added manual entries for traverse functions, please have a look ;-) I haven’t add manual for treesit-search and treesit-defun-query, etc yet.
> > 
> > Could you perhaps post a "cookbook" like recipe for trying the branch
> > for some programming language?  I mean, what should a user do, after
> > building the branch, to start using the features backed by
> > tree-sitter?  I presume the list should include downloading parsers
> > (any recommendations for the relevant sites?), compiling them (or
> > maybe there are sites with precompiled ones?), then setting some
> > variables or data structures in Emacs or invoking some commands/modes?
> > 
> > I think something like that will be useful for more people to try the
> > branch, provide feedback, and facilitate making its more stable.
> 
> I do have an article that covers many of the topics you mentioned [1]. But it lacks “hands-on” details, and it is a bit out-dated (repository moved, we changed the prefix, etc). I can update it with, say, a starter guide on implementing a minimal tree-sitter C major mode. I’ll post back once its done.

Thanks, I think that would be useful.

> [1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html

This seems to indicate we should provide more higher-level features to
make use of tree-sitter related features easier.  The instructions
there sound like a typical user will need to do a lot before he/she
can see any practical benefit from this build.

Maybe updating it will solve some of that.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  6:19                                                               ` Eli Zaretskii
@ 2022-06-17  7:32                                                                 ` Yuan Fu
  2022-06-17 10:42                                                                   ` Eli Zaretskii
  2022-06-17 18:12                                                                   ` Yoav Marco
  0 siblings, 2 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-17  7:32 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Theodor Thornhill, monnier, emacs-devel, dancol



> On Jun 16, 2022, at 11:19 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Thu, 16 Jun 2022 12:09:10 -0700
>> Cc: Eli Zaretskii <eliz@gnu.org>,
>> Stefan Monnier <monnier@iro.umontreal.ca>,
>> Emacs Devel <emacs-devel@gnu.org>,
>> Daniel Colascione <dancol@dancol.org>
>> 
>> I also added manual entries for traverse functions, please have a look ;-) I haven’t add manual for treesit-search and treesit-defun-query, etc yet.
> 
> Could you perhaps post a "cookbook" like recipe for trying the branch
> for some programming language?  I mean, what should a user do, after
> building the branch, to start using the features backed by
> tree-sitter?  I presume the list should include downloading parsers
> (any recommendations for the relevant sites?), compiling them (or
> maybe there are sites with precompiled ones?), then setting some
> variables or data structures in Emacs or invoking some commands/modes?
> 
> I think something like that will be useful for more people to try the
> branch, provide feedback, and facilitate making its more stable.

I do have an article that covers many of the topics you mentioned [1]. But it lacks “hands-on” details, and it is a bit outdated (repository moved, we changed the prefix, etc.). I can update it with, say, a starter guide on implementing a minimal tree-sitter C major mode. I’ll post back once it’s done.

[1]: https://archive.casouri.cat/note/2021/emacs-tree-sitter/index.html

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:09                                                             ` Yuan Fu
@ 2022-06-17  6:19                                                               ` Eli Zaretskii
  2022-06-17  7:32                                                                 ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-06-17  6:19 UTC (permalink / raw)
  To: Yuan Fu; +Cc: theo, monnier, emacs-devel, dancol

> From: Yuan Fu <casouri@gmail.com>
> Date: Thu, 16 Jun 2022 12:09:10 -0700
> Cc: Eli Zaretskii <eliz@gnu.org>,
>  Stefan Monnier <monnier@iro.umontreal.ca>,
>  Emacs Devel <emacs-devel@gnu.org>,
>  Daniel Colascione <dancol@dancol.org>
> 
> I also added manual entries for traverse functions, please have a look ;-) I haven’t add manual for treesit-search and treesit-defun-query, etc yet.

Could you perhaps post a "cookbook" like recipe for trying the branch
for some programming language?  I mean, what should a user do, after
building the branch, to start using the features backed by
tree-sitter?  I presume the list should include downloading parsers
(any recommendations for the relevant sites?), compiling them (or
maybe there are sites with precompiled ones?), then setting some
variables or data structures in Emacs or invoking some commands/modes?

I think something like that will be useful for more people to try the
branch, provide feedback, and facilitate making it more stable.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 21:57                                           ` yoavm448
@ 2022-06-17  1:10                                             ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-17  1:10 UTC (permalink / raw)
  To: yoavm448; +Cc: Eli Zaretskii, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 544 bytes --]



> On Jun 16, 2022, at 2:57 PM, yoavm448 <yoavm448@gmail.com> wrote:
> 
> I tried to run the benchmarks again real quick, and ran into a segfault.
> It occurs in the call to ts_query_delete in cleanup_vector when
> garbage collecting.
> 
> I'll try to gather more info tomorrow, going to bed now.


Do you happen to run on a Mac? This issue seems to occur on a Mac but not Linux.

On a separate note, I made some mistake in changing the benchmark code (as you might have noticed), the correct one is attached.

Thanks,
Yuan


[-- Attachment #2: bench.zip --]
[-- Type: application/zip, Size: 309831 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:16                                         ` Yuan Fu
@ 2022-06-16 21:57                                           ` yoavm448
  2022-06-17  1:10                                             ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: yoavm448 @ 2022-06-16 21:57 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Eli Zaretskii, emacs-devel

I tried to run the benchmarks again real quick, and ran into a segfault.
It occurs in the call to ts_query_delete in cleanup_vector when
garbage collecting.

I'll try to gather more info tomorrow, going to bed now.

  Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14  0:04                                       ` Yuan Fu
@ 2022-06-16 19:16                                         ` Yuan Fu
  2022-06-16 21:57                                           ` yoavm448
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-16 19:16 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yoav Marco, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1335 bytes --]



> On May 13, 2022, at 5:04 PM, Yuan Fu <casouri@gmail.com> wrote:
> 
> 
> 
>> On May 13, 2022, at 3:41 AM, Eli Zaretskii <eliz@gnu.org> wrote:
>> 
>>> From: Yoav Marco <yoavm448@gmail.com>
>>> Cc: casouri@gmail.com, emacs-devel@gnu.org
>>> Date: Fri, 13 May 2022 11:42:04 +0300
>>> 
>>> Eli Zaretskii <eliz@gnu.org> writes:
>>> 
>>>> Is it true that there's just one query for each PL mode, and it is
>>>> fixed (doesn't change) and doesn't depend on the buffer contents in
>>>> any way? If that is true, the major mode could compile the query
>>>> whenever it is initialized, and then reuse it in every buffer that is
>>>> under that major mode.
>>> 
>>> It's correct, though there might be more than one if a mode wants to
>>> offer fontification options users can toggle. But yeah, the major mode
>>> could compile its queries when initialized. I'm in favor of this too.
>> 
>> Then let's do that. Yuan, are there any issues with implementing
>> this?
> 
> No, it’s fairly straightforward. Added to todo-list ;-)

I’ve added support for compiling queries. Try this new benchmark that I’ve attached. You can see three forms, one each for tree-sitter, tree-sitter compiled, and font-lock. The compiled one should be fairly fast. I haven’t written benchmarks that can be added to tests yet.

Yuan


[-- Attachment #2: bench.zip --]
[-- Type: application/zip, Size: 309870 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14 18:50     ` Daniel Martín
  2022-05-14 19:09       ` Eli Zaretskii
@ 2022-06-16 19:10       ` Yuan Fu
  1 sibling, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-06-16 19:10 UTC (permalink / raw)
  To: Daniel Martín; +Cc: Emacs Devel



> On May 14, 2022, at 11:50 AM, Daniel Martín <mardani29@yahoo.es> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
>>> 
>>> Thanks for working on this.  What is the best way to report
>>> problems/contribute patches?  
>> 
>> Thanks, you can just send a patch to me or open a report on debbugs and send a patch there.
>> 
> 
> Thanks, I'll send patches to Debbugs from now on.
> 
>> 
>>> I've tried to build the branch and got a
>>> "file not found" error when including <tree_sitter/api.h> (I have
>>> tree-sitter correctly installed via Homebrew on macOS).  I've fixed the
>>> problem with the following patch:
>>> 
>>> diff --git a/configure.ac b/configure.ac
>>> index bf97dd017c..5a82d47db3 100644
>>> --- a/configure.ac
>>> +++ b/configure.ac
>>> @@ -3115,7 +3115,6 @@ AC_DEFUN
>>>    [HAVE_TREE_SITTER=yes], [HAVE_TREE_SITTER=no])
>>>  if test "${HAVE_TREE_SITTER}" = yes; then
>>>    AC_DEFINE(HAVE_TREE_SITTER, 1, [Define if using tree-sitter.])
>>> -    TREE_SITTER_LIBS=-ltree-sitter
>>>    TREE_SITTER_OBJ="treesit.o"
>>>  fi
>>> fi
>> 
>> Could you explain a bit why removing this line works for you? And what specific problem are you solving? I’m not so savvy in autotools.
>> 
> 
> The problem I tried to solve was this linker error:
> 
>  ld: library not found for -ltree-sitter
> 
> The reason is that the custom library directory on my system,
> /opt/homebrew/Cellar/, needs to be set via -L, but setting
> TREE_SITTER_LIBS overrode that.  Pkg-config already provides the
> necessary linker flags to use the library.
> 
> With my change, I get the following in config.log:
> 
> TREE_SITTER_LIBS='-L/opt/homebrew/Cellar/tree-sitter/0.20.6/lib -ltree-sitter'
> 
> which is similar, for example, to how Jansson is linked:
> 
> JSON_LIBS='-L/opt/homebrew/Cellar/jansson/2.14/lib -ljansson'
> 
> and all compiles and links successfully.
> 
> N.B: I don't know if this would work on MS-Windows.  Perhaps MS-Windows
> loads tree-sitter dynamically and will need a special case that just
> unsets TREE_SITTER_LIBS.

Thanks, I’ve removed that line from configure.ac.

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-18 21:07                                                           ` Theodor Thornhill
@ 2022-06-16 19:09                                                             ` Yuan Fu
  2022-06-17  6:19                                                               ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-06-16 19:09 UTC (permalink / raw)
  To: Theodor Thornhill
  Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione



> On May 18, 2022, at 2:07 PM, Theodor Thornhill <theo@thornhill.no> wrote:
> 
>> 
>> Thanks, I think it could be very useful. I can add an option for the
>> user of treesit-traverse-depth-first to control the depth it goes. And
>> same for treesit-traverse-forward-depth-first. A relative depth of 0
>> could mean only traverse siblings and parent, nil means traverse all
>> the way, a positive number n means traverse n steps down.
>> 
> 
> Nice, thanks!

I added a new parameter DEPTH to treesit-traverse-depth-first and treesit-traverse-forward-depth-first (renamed to treesit-traverse-forward).

I also realized that the slowness is due to treesit-query-capture, not the number of nodes we traversed. So utilizing the DEPTH parameter only has a semantic advantage. You should compile the query (a new feature I added) and use the compiled query for treesit-defun-query. With that, no matter the value of DEPTH, jumping to defun begin/end is always fast.

I also added manual entries for traverse functions, please have a look ;-) I haven’t added manual entries for treesit-search, treesit-defun-query, etc. yet.

> 
> Also, discovered a typo, and shielded the defuns with a fallback value,
> see provided patch:

Also merged, thanks.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
@ 2022-05-19  1:35 Kiong-Ge Liau
  0 siblings, 0 replies; 187+ messages in thread
From: Kiong-Ge Liau @ 2022-05-19  1:35 UTC (permalink / raw)
  To: casouri, emacs-devel

Can you please share the mentioned "treesit-demo.el" file? I cannot see
it attached to any messages on emacs-devel mailing list.

Thanks.







^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-18 20:52                                                         ` Yuan Fu
@ 2022-05-18 21:07                                                           ` Theodor Thornhill
  2022-06-16 19:09                                                             ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-18 21:07 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione

[-- Attachment #1: Type: text/plain, Size: 465 bytes --]

>
> Thanks, I think it could be very useful. I can add an option for the
> user of treesit-traverse-depth-first to control the depth it goes. And
> same for treesit-traverse-forward-depth-first. A relative depth of 0
> could mean only traverse siblings and parent, nil means traverse all
> the way, a positive number n means traverse n steps down.
>

Nice, thanks!

Also, discovered a typo, and shielded the defuns with a fallback value,
see provided patch:

Theo


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Add-fallback-value-for-arg.patch --]
[-- Type: text/x-diff, Size: 1400 bytes --]

From b1ef6a6ab06feaae26594206eaa9c93392353ab6 Mon Sep 17 00:00:00 2001
From: Theodor Thornhill <theo@thornhill.no>
Date: Wed, 18 May 2022 23:03:55 +0200
Subject: [PATCH] Add fallback value for arg

Also, fix typo in docstring
---
 lisp/treesit.el | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lisp/treesit.el b/lisp/treesit.el
index 3313168d66..761c7147a0 100644
--- a/lisp/treesit.el
+++ b/lisp/treesit.el
@@ -927,20 +927,20 @@ treesit-beginning-of-defun
 
 With ARG, do it that many times.  Negative ARG means move forward
 to the ARGth following beginning of defun.  Defun is defined
-according to `treesit-defun-pattern'."
+according to `treesit-defun-query'."
   (unless treesit-defun-query
     (error "Variable `treesit-defun-query' is unset"))
-  (treesit-search-beginning treesit-defun-query (- arg)))
+  (treesit-search-beginning treesit-defun-query (- (or arg 1))))
 
 (defun treesit-end-of-defun (&optional arg)
   "Move forward to the end of a defun.
 
 With ARG, do it that many times.  Negative ARG means move back to
 ARGth preceding end of defun.  Defun is defined according to
-`treesit-defun-pattern'."
+`treesit-defun-query'."
   (unless treesit-defun-query
     (error "Variable `treesit-defun-query' is unset"))
-  (treesit-search-end treesit-defun-query arg))
+  (treesit-search-end treesit-defun-query (or arg 1)))
 
 ;;; Debugging
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-17 21:45                                                       ` Theodor Thornhill
@ 2022-05-18 20:52                                                         ` Yuan Fu
  2022-05-18 21:07                                                           ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-18 20:52 UTC (permalink / raw)
  To: Theodor Thornhill
  Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione



> On May 17, 2022, at 2:45 PM, Theodor Thornhill <theo@thornhill.no> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
> 
>>> On May 13, 2022, at 10:03 PM, Theodor Thornhill <theo@thornhill.no> wrote:
>>> 
>>>> 
>>>> Now there is treesit-beginning/end-of-defun. You just need to set treesit-defun-query and everything else come for free. I needed to invent some heavy machinery for that, resulting in some new handy functions:
>>>> 
>>>> - treesit-traverse-depth-first
>>>> - treesit-traverse-breadth-first
>>>> - treesit-traverse-forward-depth-first (maybe this should be named simply treesit-traverse-forward?)
>>>> 
>>>> - treesit-search-forward
>>>> - treesit-search-beginning
>>>> They are untested & undocumented (in manual), so please play with them and report problems :-)
>>>> 
> 
> I've been testing the provided functionality for beginning/end-of-defun,
> and I have some thoughts I'd like to share.
> 
> For starters, let me just give some context.  The implementation I've
> used so far before the provided version looks something like
> ```
> (defun typescript-mode-move-to-node (fn)
>  (when-let ((found-node
>              (treesit-parent-until
>               (treesit-node-at (point))
>               (lambda (parent)
>                 (treesit-query-capture
>                  parent
>                  typescript-mode--defun-query)))))
>    (goto-char (funcall fn found-node))))
> 
> (defun typescript-mode-beginning-of-defun (&optional arg)
>  (typescript-mode-move-to-node #'treesit-node-start))
> 
> (defun typescript-mode-end-of-defun (&optional arg)
>  (typescript-mode-move-to-node #'treesit-node-end))
> ```
> 
> If this is given a query such as
> 
> ```
> (defvar typescript-mode--defun-query
>  "[(import_statement)
>    (function_declaration)
>    (type_alias_declaration)
>    (interface_declaration)
>    (lexical_declaration)] @defun")
> ```
> 
> we would traverse parentwise and locate a node on match.  This
> implementation is very fast, but has an issue - it will only match in
> the parentwise path, so siblings will not be found.  This makes my
> function useful, but not general enough.  The version provided in-tree
> right now uses the depth first approach, which has two big problems -
> performance and inconsistency.
> 
> Its docstring notes:
> ```
> Traversing forward depth-first means, for a tree like the below
> where NODE is marked 1, traverse as numbered:
> 
>                16
>                |
>       3--------4-----------8
>       |        |           |
>  o--o-+--1  5--+--6    9---+-----12
>  |  |    |        |    |         |
>  o  o    2        7  +-+-+    +--+--+
>                      |   |    |  |  |
>                      10  11   13 14 15
> ```
> 
> This means that if we start at node 1, I'd expect us to navigate to the
> nodes 3 - 4 - 8 - 16, when repeatedly pressing the beginning-of-defun.
> However, because we go depth first, we can end up landing at say, node
> 14, which feels unnatural.  This can happen for example in javascript if
> we add arrow_function to the nodes to match.  If node 14 contains such a
> node, the traversing order would look like this: 3 - 4 - 8 - 14 - 16.
> This feels odd, or at least differs from how normal emacs operates.  In
> addition, when the search gets long, it can take up to a second on my
> system to find the beginning of a defun, because of the amount of
> traversing required by the depth first algorithm.
> 
> I have a suggestion for a solution that you may consider.
> 
> Either add a new defcustom 'treesit-depth-first-go-deep', or add a new
> param to 'treesit-traverse-depth-first', like so:
> ```
> (defun treesit-traverse-depth-first (node pred &optional step go-deep)
>  (if (funcall pred node)
>      node
>    (and go-deep
>      (cl-loop for child in (if (or (null step) (>= step 0))
>                              (treesit-node-children node)
>                            (nreverse (treesit-node-children node)))
>             if (treesit-traverse-depth-first child pred step)
>             return child))))
> ```
> 
> This way we can avoid traversing deep into the subtrees, which is a slow
> operation, _and_ makes for an inconsistent experience.  Setting go-deep
> as nil makes the function really fast, and also keeps the benefit of
> finding siblings.
> 
> Another option is to not provide a generic depth first algorithm, and
> only go for siblings and parents, but we may want the depth first for
> other things, such as a generic 'treesit-goto-thing' function.
> 
> What do you think?

Thanks, I think it could be very useful. I can add an option for the user of treesit-traverse-depth-first to control the depth it goes. And same for treesit-traverse-forward-depth-first. A relative depth of 0 could mean only traverse siblings and parent, nil means traverse all the way, a positive number n means traverse n steps down.

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14  5:13                                                     ` Yuan Fu
@ 2022-05-17 21:45                                                       ` Theodor Thornhill
  2022-05-18 20:52                                                         ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-17 21:45 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione

Yuan Fu <casouri@gmail.com> writes:

>> On May 13, 2022, at 10:03 PM, Theodor Thornhill <theo@thornhill.no> wrote:
>> 
>>> 
>>> Now there is treesit-beginning/end-of-defun. You just need to set treesit-defun-query and everything else come for free. I needed to invent some heavy machinery for that, resulting in some new handy functions:
>>> 
>>> - treesit-traverse-depth-first
>>> - treesit-traverse-breadth-first
>>> - treesit-traverse-forward-depth-first (maybe this should be named simply treesit-traverse-forward?)
>>> 
>>> - treesit-search-forward
>>> - treesit-search-beginning
>>> They are untested & undocumented (in manual), so please play with them and report problems :-)
>>> 

I've been testing the provided functionality for beginning/end-of-defun,
and I have some thoughts I'd like to share.

For starters, let me just give some context.  The implementation I've
used so far before the provided version looks something like
```
(defun typescript-mode-move-to-node (fn)
  (when-let ((found-node
              (treesit-parent-until
               (treesit-node-at (point))
               (lambda (parent)
                 (treesit-query-capture
                  parent
                  typescript-mode--defun-query)))))
    (goto-char (funcall fn found-node))))

(defun typescript-mode-beginning-of-defun (&optional arg)
  (typescript-mode-move-to-node #'treesit-node-start))

(defun typescript-mode-end-of-defun (&optional arg)
  (typescript-mode-move-to-node #'treesit-node-end))
```

If this is given a query such as

```
(defvar typescript-mode--defun-query
  "[(import_statement)
    (function_declaration)
    (type_alias_declaration)
    (interface_declaration)
    (lexical_declaration)] @defun")
```

we would traverse parentwise and locate a node on match.  This
implementation is very fast, but has an issue - it will only match in
the parentwise path, so siblings will not be found.  This makes my
function useful, but not general enough.  The version provided in-tree
right now uses the depth first approach, which has two big problems -
performance and inconsistency.

Its docstring notes:
```
Traversing forward depth-first means, for a tree like the below
where NODE is marked 1, traverse as numbered:

                16
                |
       3--------4-----------8
       |        |           |
  o--o-+--1  5--+--6    9---+-----12
  |  |    |        |    |         |
  o  o    2        7  +-+-+    +--+--+
                      |   |    |  |  |
                      10  11   13 14 15
```

This means that if we start at node 1, I'd expect us to navigate to the
nodes 3 - 4 - 8 - 16, when repeatedly pressing the beginning-of-defun.
However, because we go depth first, we can end up landing at say, node
14, which feels unnatural.  This can happen for example in javascript if
we add arrow_function to the nodes to match.  If node 14 contains such a
node, the traversing order would look like this: 3 - 4 - 8 - 14 - 16.
This feels odd, or at least differs from how normal emacs operates.  In
addition, when the search gets long, it can take up to a second on my
system to find the beginning of a defun, because of the amount of
traversing required by the depth first algorithm.

I have a suggestion for a solution that you may consider.

Either add a new defcustom 'treesit-depth-first-go-deep', or add a new
param to 'treesit-traverse-depth-first', like so:
```
(defun treesit-traverse-depth-first (node pred &optional step go-deep)
  "Search NODE and, when GO-DEEP is non-nil, its subtree depth-first.
Call PRED with NODE first and return NODE if PRED returns non-nil.
Otherwise, if GO-DEEP is nil, return nil without looking at any
child.  If GO-DEEP is non-nil, search each child's subtree in turn
and return the first node for which PRED returns non-nil, or nil if
there is none.

Children are visited first-to-last when STEP is nil or non-negative,
and last-to-first otherwise."
  (if (funcall pred node)
      node
    (and go-deep
         (cl-loop for child in (if (or (null step) (>= step 0))
                                   (treesit-node-children node)
                                 (nreverse (treesit-node-children node)))
                  ;; Pass GO-DEEP down so the whole subtree is searched,
                  ;; and return the node that actually matched rather
                  ;; than the top-level child that contains it.
                  thereis (treesit-traverse-depth-first
                           child pred step go-deep)))))
```

This way we can avoid traversing deep into the subtrees, which is a slow
operation, _and_ makes for an inconsistent experience.  Setting go-deep
as nil makes the function really fast, and also keeps the benefit of
finding siblings.

Another option is to not provide a generic depth first algorithm, and
only go for siblings and parents, but we may want the depth first for
other things, such as a generic 'treesit-goto-thing' function.

What do you think?

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-15 19:20       ` chad
@ 2022-05-15 19:26         ` Eli Zaretskii
  0 siblings, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-15 19:26 UTC (permalink / raw)
  To: chad; +Cc: casouri, yoavm448, emacs-devel

> From: chad <yandros@gmail.com>
> Date: Sun, 15 May 2022 15:20:18 -0400
> Cc: Yoav Marco <yoavm448@gmail.com>,
>  EMACS development team <emacs-devel@gnu.org>
> 
> While it's true that Emacs doesn't fontify whole buffers under the current scheme, based on what I've read
> of tree-sitter (caveat: a fairly long while back), I think it might be reasonable to move to a
> whole-buffer-and-repair strategy for emacs+treesit, and that might provide some nice simplification along the
> way.

AFAIU, according to benchmarks shown in this thread, this strategy
will lead to several seconds of delay in response, at least for the
initial display.  I think this is too much; we should try getting rid
of such long delays.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 17:54     ` Yuan Fu
  2022-05-10 18:18       ` Yoav Marco
@ 2022-05-15 19:20       ` chad
  2022-05-15 19:26         ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: chad @ 2022-05-15 19:20 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Yoav Marco, EMACS development team

[-- Attachment #1: Type: text/plain, Size: 746 bytes --]

Please forgive the late mid-thread reply; I was on the road.

On Tue, May 10, 2022 at 1:56 PM Yuan Fu <casouri@gmail.com> wrote:

> Yeah using a single cache would probably result in a lot of misses since
> Emacs don’t fontify the whole buffer at once.
>

While it's true that Emacs doesn't fontify whole buffers under the current
scheme, based on what I've read of tree-sitter (caveat: a fairly long while
back), I think it might be reasonable to move to a whole-buffer-and-repair
strategy for emacs+treesit, and that might provide some nice simplification
along the way. Is there a good way to try this strategy now? I avoid JS
whenever possible, and that seems like it might be the current-best test
case...

Thanks,
~Chad

[-- Attachment #2: Type: text/html, Size: 1097 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14 18:50     ` Daniel Martín
@ 2022-05-14 19:09       ` Eli Zaretskii
  2022-06-16 19:10       ` Yuan Fu
  1 sibling, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-14 19:09 UTC (permalink / raw)
  To: Daniel Martín; +Cc: casouri, emacs-devel

> From: Daniel Martín <mardani29@yahoo.es>
> Cc: Emacs Devel <emacs-devel@gnu.org>
> Date: Sat, 14 May 2022 20:50:39 +0200
> 
> The problem I tried to solve was this linker error:
> 
>   ld: library not found for -ltree-sitter
> 
> The reason is that the custom library directory on my system,
> /opt/homebrew/Cellar/, needs to be set via -L, but setting
> TREE_SITTER_LIBS overrided that.  Pkg-config already provides the
> necessary linker flags to use the library.
> 
> With my change, I get the following in config.log:
> 
> TREE_SITTER_LIBS='-L/opt/homebrew/Cellar/tree-sitter/0.20.6/lib -ltree-sitter'
> 
> which is similar, for example, to how Jansson is linked:
> 
> JSON_LIBS='-L/opt/homebrew/Cellar/jansson/2.14/lib -ljansson'
> 
> and all compiles and links successfully.

The compiler and linker switches for linking against the library are
supposed to come from pkg-config (which gets them from tree-sitter.pc
file that is part of the tree-sitter library's installation).

> N.B: I don't know if this would work on MS-Windows.  Perhaps MS-Windows
> loads tree-sitter dynamically and will need a special case that just
> unsets TREE_SITTER_LIBS.

Don't worry about Windows, we will figure this out when there's a
first Emacs user who wants to build that branch on Windows.  And
initially, there's nothing wrong with linking against the library
statically even on Windows: the resulting binary will work.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14 15:55   ` Yuan Fu
@ 2022-05-14 18:50     ` Daniel Martín
  2022-05-14 19:09       ` Eli Zaretskii
  2022-06-16 19:10       ` Yuan Fu
  0 siblings, 2 replies; 187+ messages in thread
From: Daniel Martín @ 2022-05-14 18:50 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

Yuan Fu <casouri@gmail.com> writes:
>> 
>> Thanks for working on this.  What is the best way to report
>> problems/contribute patches?  
>
> Thanks, you can just send a patch to me or open a report on debbugs and send a patch there.
>

Thanks, I'll send patches to Debbugs from now on.

>
>> I've tried to build the branch and got a
>> "file not found" error when including <tree_sitter/api.h> (I have
>> tree-sitter correctly installed via Homebrew on macOS).  I've fixed the
>> problem with the following patch:
>> 
>> diff --git a/configure.ac b/configure.ac
>> index bf97dd017c..5a82d47db3 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -3115,7 +3115,6 @@ AC_DEFUN
>>     [HAVE_TREE_SITTER=yes], [HAVE_TREE_SITTER=no])
>>   if test "${HAVE_TREE_SITTER}" = yes; then
>>     AC_DEFINE(HAVE_TREE_SITTER, 1, [Define if using tree-sitter.])
>> -    TREE_SITTER_LIBS=-ltree-sitter
>>     TREE_SITTER_OBJ="treesit.o"
>>   fi
>> fi
>
> Could you explain a bit why removing this line works for you? And what specific problem are you solving? I’m not so savvy in autotools.
>

The problem I tried to solve was this linker error:

  ld: library not found for -ltree-sitter

The reason is that the custom library directory on my system,
/opt/homebrew/Cellar/, needs to be set via -L, but setting
TREE_SITTER_LIBS overrided that.  Pkg-config already provides the
necessary linker flags to use the library.

With my change, I get the following in config.log:

TREE_SITTER_LIBS='-L/opt/homebrew/Cellar/tree-sitter/0.20.6/lib -ltree-sitter'

which is similar, for example, to how Jansson is linked:

JSON_LIBS='-L/opt/homebrew/Cellar/jansson/2.14/lib -ljansson'

and all compiles and links successfully.

N.B: I don't know if this would work on MS-Windows.  Perhaps MS-Windows
loads tree-sitter dynamically and will need a special case that just
unsets TREE_SITTER_LIBS.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14 15:09 ` Daniel Martín
@ 2022-05-14 15:55   ` Yuan Fu
  2022-05-14 18:50     ` Daniel Martín
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-14 15:55 UTC (permalink / raw)
  To: Daniel Martín; +Cc: Emacs Devel



> On May 14, 2022, at 8:09 AM, Daniel Martín <mardani29@yahoo.es> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
> 
>> Guys,
>> 
>> I’ve pushed the tree-sitter integration to feature/tree-sitter. If anyone want to give it a try:
>> - get tree-sitter from GitHub (or your package manager, make sure the version is at least 0.20.2)
>> https://github.com/tree-sitter/tree-sitter
>> - pull and build the branch
>> - read (elisp)Parsing Program Source
>> - grab language definitions from https://github.com/casouri/tree-sitter-module
>> You can either build with my script, or download the prebuilt ones
>> - play with it
>> 
>> Also apologize in advance for that my response might be slow until like June :-)
>> 
>> Yuan
> 
> Thanks for working on this.  What is the best way to report
> problems/contribute patches?  

Thanks, you can just send a patch to me or open a report on debbugs and send a patch there.


> I've tried to build the branch and got a
> "file not found" error when including <tree_sitter/api.h> (I have
> tree-sitter correctly installed via Homebrew on macOS).  I've fixed the
> problem with the following patch:
> 
> diff --git a/configure.ac b/configure.ac
> index bf97dd017c..5a82d47db3 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -3115,7 +3115,6 @@ AC_DEFUN
>     [HAVE_TREE_SITTER=yes], [HAVE_TREE_SITTER=no])
>   if test "${HAVE_TREE_SITTER}" = yes; then
>     AC_DEFINE(HAVE_TREE_SITTER, 1, [Define if using tree-sitter.])
> -    TREE_SITTER_LIBS=-ltree-sitter
>     TREE_SITTER_OBJ="treesit.o"
>   fi
> fi

Could you explain a bit why removing this line works for you? And what specific problem are you solving? I’m not so savvy in autotools.


> diff --git a/src/Makefile.in b/src/Makefile.in
> index a21af42c0b..7533f25963 100644
> --- a/src/Makefile.in
> +++ b/src/Makefile.in
> @@ -344,7 +344,7 @@ JSON_CFLAGS =
> JSON_OBJ = @JSON_OBJ@
> 
> TREE_SITTER_LIBS = @TREE_SITTER_LIBS@
> -TREE_SITTER_FLAGS = @TREE_SITTER_FLAGS@
> +TREE_SITTER_CFLAGS = @TREE_SITTER_CFLAGS@
> TREE_SITTER_OBJ = @TREE_SITTER_OBJ@
> 
> INTERVALS_H = dispextern.h intervals.h composite.h

That’s indeed a typo, thanks.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  8:29 Yuan Fu
                   ` (2 preceding siblings ...)
  2022-05-07  9:04 ` Eli Zaretskii
@ 2022-05-14 15:09 ` Daniel Martín
  2022-05-14 15:55   ` Yuan Fu
  3 siblings, 1 reply; 187+ messages in thread
From: Daniel Martín @ 2022-05-14 15:09 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

Yuan Fu <casouri@gmail.com> writes:

> Guys,
>
> I’ve pushed the tree-sitter integration to feature/tree-sitter. If anyone want to give it a try:
> - get tree-sitter from GitHub (or your package manager, make sure the version is at least 0.20.2)
> https://github.com/tree-sitter/tree-sitter
> - pull and build the branch
> - read (elisp)Parsing Program Source
> - grab language definitions from https://github.com/casouri/tree-sitter-module
> You can either build with my script, or download the prebuilt ones
> - play with it
>
> Also apologize in advance for that my response might be slow until like June :-)
>
> Yuan

Thanks for working on this.  What is the best way to report
problems/contribute patches?  I've tried to build the branch and got a
"file not found" error when including <tree_sitter/api.h> (I have
tree-sitter correctly installed via Homebrew on macOS).  I've fixed the
problem with the following patch:

diff --git a/configure.ac b/configure.ac
index bf97dd017c..5a82d47db3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3115,7 +3115,6 @@ AC_DEFUN
     [HAVE_TREE_SITTER=yes], [HAVE_TREE_SITTER=no])
   if test "${HAVE_TREE_SITTER}" = yes; then
     AC_DEFINE(HAVE_TREE_SITTER, 1, [Define if using tree-sitter.])
-    TREE_SITTER_LIBS=-ltree-sitter
     TREE_SITTER_OBJ="treesit.o"
   fi
 fi
diff --git a/src/Makefile.in b/src/Makefile.in
index a21af42c0b..7533f25963 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -344,7 +344,7 @@ JSON_CFLAGS =
 JSON_OBJ = @JSON_OBJ@
 
 TREE_SITTER_LIBS = @TREE_SITTER_LIBS@
-TREE_SITTER_FLAGS = @TREE_SITTER_FLAGS@
+TREE_SITTER_CFLAGS = @TREE_SITTER_CFLAGS@
 TREE_SITTER_OBJ = @TREE_SITTER_OBJ@
 
 INTERVALS_H = dispextern.h intervals.h composite.h

I have already signed the FSF paperwork.  Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14  5:03                                                   ` Theodor Thornhill
@ 2022-05-14  5:13                                                     ` Yuan Fu
  2022-05-17 21:45                                                       ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-14  5:13 UTC (permalink / raw)
  To: Theodor Thornhill
  Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione



> On May 13, 2022, at 10:03 PM, Theodor Thornhill <theo@thornhill.no> wrote:
> 
>> 
>> Now there is treesit-beginning/end-of-defun. You just need to set treesit-defun-query and everything else come for free. I needed to invent some heavy machinery for that, resulting in some new handy functions:
>> 
>> - treesit-traverse-depth-first
>> - treesit-traverse-breadth-first
>> - treesit-traverse-forward-depth-first (maybe this should be named simply treesit-traverse-forward?)
>> 
>> - treesit-search-forward
>> - treesit-search-beginning
>> - treesit-search-end
>> 
> 
> I cannot find them on the branch - did you push them?

My bad, I pushed to the wrong repo. Now they should be up.

> 
>> They are untested & undocumented (in manual), so please play with them and report problems :-)
>> 
> 
> Will do :)

Thanks!

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-14  0:03                                                 ` Yuan Fu
@ 2022-05-14  5:03                                                   ` Theodor Thornhill
  2022-05-14  5:13                                                     ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-14  5:03 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione

>
> Now there is treesit-beginning/end-of-defun. You just need to set treesit-defun-query and everything else come for free. I needed to invent some heavy machinery for that, resulting in some new handy functions:
>
> - treesit-traverse-depth-first
> - treesit-traverse-breadth-first
> - treesit-traverse-forward-depth-first (maybe this should be named simply treesit-traverse-forward?)
>
> - treesit-search-forward
> - treesit-search-beginning
> - treesit-search-end
>

I cannot find them on the branch - did you push them?

> They are untested & undocumented (in manual), so please play with them and report problems :-)
>

Will do :)

Theo



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13 10:41                                     ` Eli Zaretskii
@ 2022-05-14  0:04                                       ` Yuan Fu
  2022-06-16 19:16                                         ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-14  0:04 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yoav Marco, emacs-devel



> On May 13, 2022, at 3:41 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yoav Marco <yoavm448@gmail.com>
>> Cc: casouri@gmail.com, emacs-devel@gnu.org
>> Date: Fri, 13 May 2022 11:42:04 +0300
>> 
>> Eli Zaretskii <eliz@gnu.org> writes:
>> 
>>> Is it true that there's just one query for each PL mode, and it is
>>> fixed (doesn't change) and doesn't depend on the buffer contents in
>>> any way? If that is true, the major mode could compile the query
>>> whenever it is initialized, and then reuse it in every buffer that is
>>> under that major mode.
>> 
>> It's correct, though there might be more than one if a mode wants to
>> offer fontification options users can toggle. But yeah, the major mode
>> could compile its queries when initialized. I'm in favor of this too.
> 
> Then let's do that. Yuan, are there any issues with implementing
> this?

No, it’s fairly straightforward. Added to todo-list ;-)

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 21:33                                               ` Theodor Thornhill
@ 2022-05-14  0:03                                                 ` Yuan Fu
  2022-05-14  5:03                                                   ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-14  0:03 UTC (permalink / raw)
  To: Theodor Thornhill
  Cc: Eli Zaretskii, Stefan Monnier, Emacs Devel, Daniel Colascione



> On May 9, 2022, at 2:33 PM, Theodor Thornhill <theo@thornhill.no> wrote:
> 
>> 
>> +(defun js-treesit-move-to-node (fn)
>> + (when-let ((found-node (treesit-parent-until
>> + (treesit-node-at (point) (point) 'javascript)
>> + (lambda (parent)
>> + (let ((parent-type (treesit-node-type parent)))
>> + (or (equal "function_declaration" parent-type)
>> + ;;; More declarations here
>> + ))))))
>> + (goto-char (funcall fn found-node))))
>> +
>> +(defun js-treesit-beginning-of-defun (&optional arg)
>> + (js-treesit-move-to-node #'treesit-node-start))
>> +
>> +(defun js-treesit-end-of-defun (&optional arg)
>> + (js-treesit-move-to-node #'treesit-node-end))
>> 
>> Maybe I could extract this into treesit.el, so major modes can specify
>> simply the node name for a function definition and get function
>> traversal for free.
>> 
> 
> Yeah! My plan was to use `M-a` and `M-e` for siblings and `C-M-a` and
> `C-M-e` for beginning/end-of-defun. Some way of supplying "defun-nodes"
> and make treesit handle the rest would be awesome.

Now there is treesit-beginning/end-of-defun. You just need to set treesit-defun-query and everything else come for free. I needed to invent some heavy machinery for that, resulting in some new handy functions:

- treesit-traverse-depth-first
- treesit-traverse-breadth-first
- treesit-traverse-forward-depth-first (maybe this should be named simply treesit-traverse-forward?)

- treesit-search-forward
- treesit-search-beginning
- treesit-search-end

They are untested & undocumented (in manual), so please play with them and report problems :-)

(BREAKING) I also changed the semantic of treesit-node-at, the old semantic sometimes returns unexpected result and it is best to change it to something more intuitive. The old semantic can still be found in treesit-node-on.

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13 10:37                                     ` Eli Zaretskii
@ 2022-05-13 10:52                                       ` Theodor Thornhill
  0 siblings, 0 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-13 10:52 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: yoavm448, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> From: Theodor Thornhill <theo@thornhill.no>
>> Cc: casouri@gmail.com, emacs-devel@gnu.org
>> Date: Fri, 13 May 2022 10:04:03 +0200
>> 
>> > Is it true that there's just one query for each PL mode, and it is
>> > fixed (doesn't change) and doesn't depend on the buffer contents in
>> > any way?  If that is true, the major mode could compile the query
>> > whenever it is initialized, and then reuse it in every buffer that is
>> > under that major mode.
>> >
>> 
>> Yes, for indentation and font locking, this is correct.  I'd think that
>> it'll be enough to compile on mode init, and just reuse it.  For some
>> hypothetical other uses, such as searching and replacing, we would need
>> to be more dynamic, but that won't have the performance issues that font
>> locking typically has.
>
> Right.
>
>> Why not use the same idea as the `eglot-managed-mode`, where if the
>> file fulfills some predicate, we choose to treat them all as equals.
>> Thus we only need to compile/read/use the queries once, and can
>> simply lookup what we need.
>
> We can do something like that if needed.  But I don't necessarily see
> the need yet.  When will we need this, if a major mode compiles the
> query once when it is first turned on in some buffer?
>

Thinking more about that, I don't think we do need it, considering it
will run as a major mode. I guess it may be interesting should we want
to supply tree sitter functionality as minor modes. Let's say that some
major mode author doesn't want to integrate with tree sitter, and won't
accept such a patch.  Then we could still allow overriding font locking
of that mode given the proper means to do so.  Another case is for minor
modes such as paredit and the likes.  It could support advanced editing
facilities without being part of the main tree sitter major mode
integration.  Then it would make sense to have a tree-sitter-minor mode
thing.  But this is all just ideas from the top of my head, and not
necessarily anything worth considering for the first implementation of
tree sitter.

>> > There isn't any (IIUC what you are asking).  Fontification is a
>> > feature of interactive sessions, and is basically meaningless without
>> > normal redisplay.
>> >
>> 
>> An ok benchmark would be using C-n rather than C-v, because that seems
>> to trigger more performance issues in my daily use.
>
> We should benchmark both, because both are important.

Agreed



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13  8:42                                   ` Yoav Marco
@ 2022-05-13 10:41                                     ` Eli Zaretskii
  2022-05-14  0:04                                       ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-13 10:41 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Fri, 13 May 2022 11:42:04 +0300
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> > Is it true that there's just one query for each PL mode, and it is
> > fixed (doesn't change) and doesn't depend on the buffer contents in
> > any way?  If that is true, the major mode could compile the query
> > whenever it is initialized, and then reuse it in every buffer that is
> > under that major mode.
> 
> It's correct, though there might be more than one if a mode wants to
> offer fontification options users can toggle. But yeah, the major mode
> could compile its queries when initialized. I'm in favor of this too.

Then let's do that.  Yuan, are there any issues with implementing
this?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13  8:04                                   ` Theodor Thornhill
  2022-05-13  8:36                                     ` Yoav Marco
@ 2022-05-13 10:37                                     ` Eli Zaretskii
  2022-05-13 10:52                                       ` Theodor Thornhill
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-13 10:37 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: yoavm448, casouri, emacs-devel

> From: Theodor Thornhill <theo@thornhill.no>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Fri, 13 May 2022 10:04:03 +0200
> 
> > Is it true that there's just one query for each PL mode, and it is
> > fixed (doesn't change) and doesn't depend on the buffer contents in
> > any way?  If that is true, the major mode could compile the query
> > whenever it is initialized, and then reuse it in every buffer that is
> > under that major mode.
> >
> 
> Yes, for indentation and font locking, this is correct.  I'd think that
> it'll be enough to compile on mode init, and just reuse it.  For some
> hypothetical other uses, such as searching and replacing, we would need
> to be more dynamic, but that won't have the performance issues that font
> locking typically has.

Right.

> Why not use the same idea as the `eglot-managed-mode`, where if the
> file fulfills some predicate, we choose to treat them all as equals.
> Thus we only need to compile/read/use the queries once, and can
> simply lookup what we need.

We can do something like that if needed.  But I don't necessarily see
the need yet.  When will we need this, if a major mode compiles the
query once when it is first turned on in some buffer?

> > There isn't any (IIUC what you are asking).  Fontification is a
> > feature of interactive sessions, and is basically meaningless without
> > normal redisplay.
> >
> 
> An ok benchmark would be using C-n rather than C-v, because that seems
> to trigger more performance issues in my daily use.

We should benchmark both, because both are important.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13  8:36                                     ` Yoav Marco
@ 2022-05-13  9:46                                       ` Theodor Thornhill
  0 siblings, 0 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-13  9:46 UTC (permalink / raw)
  To: Yoav Marco; +Cc: Eli Zaretskii, casouri, emacs-devel

Yoav Marco <yoavm448@gmail.com> writes:

> Theodor Thornhill <theo@thornhill.no> writes:
>
>> I don't think that
>> parsing a c file with go queries is at all interesting, because the
>> parser would return errors all over and is clearly not how it is
>> supposed to be used.
>
> That was accidental and only in the first benchmarks regarding nconc.
> Later benchmarks use xdisp.c with actual C queries, so not to worry.
>


Great - sorry for the noise then :)



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13  6:34                                 ` Eli Zaretskii
  2022-05-13  8:04                                   ` Theodor Thornhill
@ 2022-05-13  8:42                                   ` Yoav Marco
  2022-05-13 10:41                                     ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-13  8:42 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> Is it true that there's just one query for each PL mode, and it is
> fixed (doesn't change) and doesn't depend on the buffer contents in
> any way?  If that is true, the major mode could compile the query
> whenever it is initialized, and then reuse it in every buffer that is
> under that major mode.

It's correct, though there might be more than one if a mode wants to
offer fontification options users can toggle. But yeah, the major mode
could compile its queries when initialized. I'm in favor of this too.

>> Okay, we can try that. What's the proper way to trigger a "natural
>> fontification" as would occur in the GUI without opening an interactive
>> session?
>
> There isn't any (IIUC what you are asking).  Fontification is a
> feature of interactive sessions, and is basically meaningless without
> normal redisplay.
>
>> I'd rather use the groundwork that's actually used by users,
>> and not get stuff like the JIT chunck size wrong. In general I'm not too
>> familiar with that part of Emacs; the benchmarks up to now used
>> with-temp-buffer, would that suffice for these new benchmarks?
>
> Using with-temp-buffer could cause problems, because not everything is
> set up as it would when actually visiting the file.  Why is
> with-temp-buffer necessary for the benchmarks?
>
> But if it turns out that a query doesn't depend on the buffer
> contents, I think this is a moot point, and the major mode could
> compile the query just once when its first loaded.

Yeah.

- Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13  8:04                                   ` Theodor Thornhill
@ 2022-05-13  8:36                                     ` Yoav Marco
  2022-05-13  9:46                                       ` Theodor Thornhill
  2022-05-13 10:37                                     ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-13  8:36 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: Eli Zaretskii, casouri, emacs-devel

Theodor Thornhill <theo@thornhill.no> writes:

> I don't think that
> parsing a c file with go queries is at all interesting, because the
> parser would return errors all over and is clearly not how it is
> supposed to be used.

That was accidental and only in the first benchmarks regarding nconc.
Later benchmarks use xdisp.c with actual C queries, so not to worry.

 - Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-13  6:34                                 ` Eli Zaretskii
@ 2022-05-13  8:04                                   ` Theodor Thornhill
  2022-05-13  8:36                                     ` Yoav Marco
  2022-05-13 10:37                                     ` Eli Zaretskii
  2022-05-13  8:42                                   ` Yoav Marco
  1 sibling, 2 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-13  8:04 UTC (permalink / raw)
  To: Eli Zaretskii, Yoav Marco; +Cc: casouri, emacs-devel

Hi and sorry to interject,

>> compile and run multiple queries.
>
> Is it true that there's just one query for each PL mode, and it is
> fixed (doesn't change) and doesn't depend on the buffer contents in
> any way?  If that is true, the major mode could compile the query
> whenever it is initialized, and then reuse it in every buffer that is
> under that major mode.
>

Yes, for indentation and font locking, this is correct.  I'd think that
it'll be enough to compile on mode init, and just reuse it.  For some
hypothetical other uses, such as searching and replacing, we would need
to be more dynamic, but that won't have the performance issues that font
locking typically has.  Why not use the same idea as the
`eglot-managed-mode`, where if the file fulfills some predicate, we
choose to treat them all as equals.  Thus we only need to
compile/read/use the queries once, and can simply lookup what we need.

The `treesit-font-lock-fontify-region` is the culprit here, and
could look up in an easier way, I think.

> There isn't any (IIUC what you are asking).  Fontification is a
> feature of interactive sessions, and is basically meaningless without
> normal redisplay.
>

An ok benchmark would be using C-n rather than C-v, because that seems
to trigger more performance issues in my daily use.  I don't think that
parsing a c file with go queries is at all interesting, because the
parser would return errors all over and is clearly not how it is
supposed to be used.

> But if it turns out that a query doesn't depend on the buffer
> contents, I think this is a moot point, and the major mode could
> compile the query just once when its first loaded.
>

Agreed.

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12 17:22                               ` Yoav Marco
@ 2022-05-13  6:34                                 ` Eli Zaretskii
  2022-05-13  8:04                                   ` Theodor Thornhill
  2022-05-13  8:42                                   ` Yoav Marco
  0 siblings, 2 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-13  6:34 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Thu, 12 May 2022 20:22:30 +0300
> 
> > But maybe we should make this discussion more concrete.  Can you show
> > the queries and explain how they are produced from the font-lock rules
> > (or whatever else they are produced from)?  How many different queries
> > do we expect to have in a garden-variety major mode for a PL, and what
> > do they depend on?
> 
> So first of all, query is kind of an aggregate term, since one query
> string/sexp can contain many "query patterns". I expect most major modes
> to have one big query string/sexp, and maybe a handful more that are
> optional to users. treesit allows you to set as many query strings/sexps
> as you want for syntax highlighting. Outside of that, queries are also
> how packages like evil-textobj-tree-sitter work, with the backend of the
> elisp-tree-sitter which uses a dynamic module.
> 
> Queries are specific to the parse tree and therefore to the parser. Most
> parsers have a queries/highlights.scm file in their repo, and
> tree-sitter-langs contains a bunch of these:
> 
> > https://github.com/emacs-tree-sitter/tree-sitter-langs/#readme
> >
> > Highlighting query patterns for a language are in the file
> > queries/<lang>/highlights.scm. Most of them are intentionally
> > different from those from upstream repositories, which are more geared
> > towards GitHub’s use cases. We try to be more consistent with Emacs’s
> > existing conventions. (For some languages, this is WIP, so their
> > patterns may look similar to upstream’s.)
> 
> The query I used in the benchmarks is tree-sitter-langs's
> queries/c/highlights.scm, which is a rather big file. One thing to check
> that I only thought of now is how long it takes with treesit having to
> compile and run multiple queries.

Is it true that there's just one query for each PL mode, and it is
fixed (doesn't change) and doesn't depend on the buffer contents in
any way?  If that is true, the major mode could compile the query
whenever it is initialized, and then reuse it in every buffer that is
under that major mode.

If the above conclusion is not correct, then please tell what are the
differences between the query/queries of different buffers, and how do
they depend on the buffer contents.

> >   . the time it takes to visit xdisp.c and display the first window-full
> >   . visit xdisp.c, then immediately go to its end
> >   . C-v in xdisp.c (repeat many times to see how much a single C-v
> >     takes)
> 
> Okay, we can try that. What's the proper way to trigger a "natural
> fontification" as would occur in the GUI without opening an interactive
> session?

There isn't any (IIUC what you are asking).  Fontification is a
feature of interactive sessions, and is basically meaningless without
normal redisplay.

> I'd rather use the groundwork that's actually used by users,
> and not get stuff like the JIT chunk size wrong. In general I'm not too
> familiar with that part of Emacs; the benchmarks up to now used
> with-temp-buffer, would that suffice for these new benchmarks?

Using with-temp-buffer could cause problems, because not everything is
set up as it would when actually visiting the file.  Why is
with-temp-buffer necessary for the benchmarks?

But if it turns out that a query doesn't depend on the buffer
contents, I think this is a moot point, and the major mode could
compile the query just once when it's first loaded.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12 17:18                             ` Eli Zaretskii
@ 2022-05-12 17:22                               ` Yoav Marco
  2022-05-13  6:34                                 ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-12 17:22 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel


Eli Zaretskii <eliz@gnu.org> writes:

>> From: Yoav Marco <yoavm448@gmail.com>
>> Cc: casouri@gmail.com, emacs-devel@gnu.org
>> Date: Thu, 12 May 2022 19:26:50 +0300
>>
>> How I understand it, if it takes 23.474s to fontify 2332 times without
>> query caching and 0.037s with, then 99.7% of the time is spent in
>> recompiling the same query, or (23.474 - 0.037)/2332 = 10ms per
>> fontification.
>
> Yes, and 10 ms is negligibly short.  So, while the relative speedup is
> very significant, I still don't see any reason for caching the
> queries.
>
> But maybe we should make this discussion more concrete.  Can you show
> the queries and explain how they are produced from the font-lock rules
> (or whatever else they are produced from)?  How many different queries
> do we expect to have in a garden-variety major mode for a PL, and what
> do they depend on?

So first of all, query is kind of an aggregate term, since one query
string/sexp can contain many "query patterns". I expect most major modes
to have one big query string/sexp, and maybe a handful more that are
optional to users. treesit allows you to set as many query strings/sexps
as you want for syntax highlighting. Outside of that, queries are also
how packages like evil-textobj-tree-sitter work, with the backend of the
elisp-tree-sitter which uses a dynamic module.

Queries are specific to the parse tree and therefore to the parser. Most
parsers have a queries/highlights.scm file in their repo, and
tree-sitter-langs contains a bunch of these:

> https://github.com/emacs-tree-sitter/tree-sitter-langs/#readme
>
> Highlighting query patterns for a language are in the file
> queries/<lang>/highlights.scm. Most of them are intentionally
> different from those from upstream repositories, which are more geared
> towards GitHub’s use cases. We try to be more consistent with Emacs’s
> existing conventions. (For some languages, this is WIP, so their
> patterns may look similar to upstream’s.)

The query I used in the benchmarks is tree-sitter-langs's
queries/c/highlights.scm, which is a rather big file. One thing to check
that I only thought of now is how long it takes with treesit having to
compile and run multiple queries.

>> Explanation for the whole table:
>>
>> |   |                     | font-lock | TS sexp |     TS | TS query reuse |
>> | 1 | xdisp.c all at once |    12.886 |   0.031 |  0.016 |          0.017 |
>> | 2 | 20 × 512c           |     0.273 |   0.214 |  0.209 |          0.000 |
>> | 3 | 512c to end         |       4m+ |  24.177 | 23.474 |          0.037 |
>>
>> Rows:
>> - Benchmark 1 xdisp.c all at once: run font-lock-fontify-region
>>   on the entire buffer once
>> - Benchmark 2 20 × 512c: fontify the next 512 characters 20 times
>> - Benchmark 3 512c to end: fontify the next 512 characters, repeatedly,
>>   until the buffer ends
>
> Thanks.  I think these benchmarks are not very useful.  Representative
> benchmarks I can think of are:
>
>   . the time it takes to visit xdisp.c and display the first window-full
>   . visit xdisp.c, then immediately go to its end
>   . C-v in xdisp.c (repeat many times to see how much a single C-v
>     takes)

Okay, we can try that. What's the proper way to trigger a "natural
fontification" as would occur in the GUI without opening an interactive
session? I'd rather use the groundwork that's actually used by users,
and not get stuff like the JIT chunk size wrong. In general I'm not too
familiar with that part of Emacs; the benchmarks up to now used
with-temp-buffer, would that suffice for these new benchmarks?

>> I thought garbage collection could take care of that. Is that
>> problematic?
>
> GC can take care of queries that the Lisp program no longer needs, but
> the Lisp program should first decide that it no longer needs them.
> Like stop referencing them in any data structure.

Is that a problem? If anyone's generating queries and putting them in
lists, that would be a problem whether they're strings or compiled
objects.

 - Yoav



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12 16:26                           ` Yoav Marco
@ 2022-05-12 17:18                             ` Eli Zaretskii
  2022-05-12 17:22                               ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-12 17:18 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Thu, 12 May 2022 19:26:50 +0300
> 
> How I understand it, if it takes 23.474s to fontify 2332 times without
> query caching and 0.037s with, then 99.7% of the time is spent in
> recompiling the same query, or (23.474 - 0.037)/2332 = 10ms per
> fontification.

Yes, and 10 ms is negligibly short.  So, while the relative speedup is
very significant, I still don't see any reason for caching the
queries.

But maybe we should make this discussion more concrete.  Can you show
the queries and explain how they are produced from the font-lock rules
(or whatever else they are produced from)?  How many different queries
do we expect to have in a garden-variety major mode for a PL, and what
do they depend on?

> Explanation for the whole table:
> 
> |   |                     | font-lock | TS sexp |     TS | TS query reuse |
> | 1 | xdisp.c all at once |    12.886 |   0.031 |  0.016 |          0.017 |
> | 2 | 20 × 512c           |     0.273 |   0.214 |  0.209 |          0.000 |
> | 3 | 512c to end         |       4m+ |  24.177 | 23.474 |          0.037 |
> 
> Rows:
> - Benchmark 1 xdisp.c all at once: run font-lock-fontify-region
>   on the entire buffer once
> - Benchmark 2 20 × 512c: fontify the next 512 characters 20 times
> - Benchmark 3 512c to end: fontify the next 512 characters, repeatedly,
>   until the buffer ends

Thanks.  I think these benchmarks are not very useful.  Representative
benchmarks I can think of are:

  . the time it takes to visit xdisp.c and display the first window-full
  . visit xdisp.c, then immediately go to its end
  . C-v in xdisp.c (repeat many times to see how much a single C-v
    takes)

> >> >> If we expose "compiled query” we don’t need to cache them either.
> >> >
> >> > Then the Lisp program will have to do that, which is even worse,
> >> > because the problems I described will now have to be solved by Lisp
> >> > application programmers, each time anew.
> >>
> >> Will they? They'd just need to compile their queries once, when defining
> >> them or when setting treesit-font-lock-defaults.
> >
> > And decide when to discard them.
> 
> I thought garbage collection could take care of that. Is that
> problematic?

GC can take care of queries that the Lisp program no longer needs, but
the Lisp program should first decide that it no longer needs them.
Like stop referencing them in any data structure.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12 16:04                         ` Eli Zaretskii
@ 2022-05-12 16:26                           ` Yoav Marco
  2022-05-12 17:18                             ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-12 16:26 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel


Eli Zaretskii <eliz@gnu.org> writes:

>> From: Yoav Marco <yoavm448@gmail.com>
>> Cc: Yuan Fu <casouri@gmail.com>, emacs-devel@gnu.org
>> Date: Thu, 12 May 2022 17:16:41 +0300
>>
>> And it probably is: in my benchmark, query compilation improved
>> performance in much more than 16/6=266%: it went from 6.06 to 0.01.
>
> That was in one of the tests, which, AFAIU, is not very interesting
> for assessing the effect on practical use cases in Emacs usage.  Or
> are you saying that Yuan's explanation of what that test tested was
> incorrect?  In that case, please post the correct explanation.

Sorry, I'm saying I'm not sure how he got to the fraction of how much
time it takes to compile a query.

How I understand it, if it takes 23.474s to fontify 2332 times without
query caching and 0.037s with, then 99.7% of the time is spent in
recompiling the same query, or (23.474 - 0.037)/2332 = 10ms per
fontification. Which, uh, is what Yuan said, but I don't know how he
reached the "0.0158s per call to font-lock-region".

>> > According to your benchmarks, it is already very fast: 16 msec is a
>> > negligible time interval.  Of course, 40 is a somewhat arbitrary
>> > number, but to get a less arbitrary one, we should determine it from
>> > some concrete scenarios, such as the 512-character chunk JIT font-lock
>> > uses during redisplay, or the number of lines on a typical window
>> > that's important when one scrolls with C-v/M-v, etc.
>>
>> It's easy enough to convert the benchmarks to 512-chars chunks rather
>> than 40 lines. See table a few paragraphs below.
>
> I'm sorry, I don't understand how to interpret that table.  Can you
> please explain the two last entries in the left column?

Explanation for the whole table:

|   |                     | font-lock | TS sexp |     TS | TS query reuse |
| 1 | xdisp.c all at once |    12.886 |   0.031 |  0.016 |          0.017 |
| 2 | 20 × 512c           |     0.273 |   0.214 |  0.209 |          0.000 |
| 3 | 512c to end         |       4m+ |  24.177 | 23.474 |          0.037 |

Rows:
- Benchmark 1 xdisp.c all at once: run font-lock-fontify-region
  on the entire buffer once
- Benchmark 2 20 × 512c: fontify the next 512 characters 20 times
- Benchmark 3 512c to end: fontify the next 512 characters, repeatedly,
  until the buffer ends

Columns:
- font-lock: fontifying using c-mode's font-lock setup
- TS sexp: using current non-caching treesit, but giving it the query as
  a sexp and not as a string
- TS: current non-caching treesit, but supplying query as string
- TS query reuse: caching compiled query objects using my dumb patch
  that just reuses the last query object as long as the char* for the
  query string doesn't change


>> >> If we expose "compiled query” we don’t need to cache them either.
>> >
>> > Then the Lisp program will have to do that, which is even worse,
>> > because the problems I described will now have to be solved by Lisp
>> > application programmers, each time anew.
>>
>> Will they? They'd just need to compile their queries once, when defining
>> them or when setting treesit-font-lock-defaults.
>
> And decide when to discard them.

I thought garbage collection could take care of that. Is that
problematic?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12 14:16                       ` Yoav Marco
@ 2022-05-12 16:04                         ` Eli Zaretskii
  2022-05-12 16:26                           ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-12 16:04 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: Yuan Fu <casouri@gmail.com>, emacs-devel@gnu.org
> Date: Thu, 12 May 2022 17:16:41 +0300
> 
> And it probably is: in my benchmark, query compilation improved
> performance in much more than 16/6=266%: it went from 6.06 to 0.01.

That was in one of the tests, which, AFAIU, is not very interesting
for assessing the effect on practical use cases in Emacs usage.  Or
are you saying that Yuan's explanation of what that test tested was
incorrect?  In that case, please post the correct explanation.

> > According to your benchmarks, it is already very fast: 16 msec is a
> > negligible time interval.  Of course, 40 is a somewhat arbitrary
> > number, but to get a less arbitrary one, we should determine it from
> > some concrete scenarios, such as the 512-character chunk JIT font-lock
> > uses during redisplay, or the number of lines on a typical window
> > that's important when one scrolls with C-v/M-v, etc.
> 
> It's easy enough to convert the benchmarks to 512-chars chunks rather
> than 40 lines. See table a few paragraphs below.

I'm sorry, I don't understand how to interpret that table.  Can you
please explain the two last entries in the left column?

> >> If we expose "compiled query” we don’t need to cache them either.
> >
> > Then the Lisp program will have to do that, which is even worse,
> > because the problems I described will now have to be solved by Lisp
> > application programmers, each time anew.
> 
> Will they? They'd just need to compile their queries once, when defining
> them or when setting treesit-font-lock-defaults.

And decide when to discard them.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12 15:18                         ` Stefan Monnier
@ 2022-05-12 15:53                           ` Eli Zaretskii
  0 siblings, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-12 15:53 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: casouri, yoavm448, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Yuan Fu <casouri@gmail.com>,  yoavm448@gmail.com,  emacs-devel@gnu.org
> Date: Thu, 12 May 2022 11:18:28 -0400
> 
> Eli Zaretskii [2022-05-12 08:19:22] wrote:
> 
> > Btw, how about adding this and other benchmarks to the test suite?  We
> > may wish to use them later, to measure potential speedups or
> > slowdowns.
> 
> Not sure if the test suite is the best place for that.
> The `elisp-benchmarks` GNU ELPA package might be more appropriate.

I prefer to have it handy in core.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12  5:19                       ` Eli Zaretskii
  2022-05-12  6:10                         ` Yuan Fu
@ 2022-05-12 15:18                         ` Stefan Monnier
  2022-05-12 15:53                           ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Stefan Monnier @ 2022-05-12 15:18 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yuan Fu, yoavm448, emacs-devel

Eli Zaretskii [2022-05-12 08:19:22] wrote:

>> From: Yuan Fu <casouri@gmail.com>
>> Date: Wed, 11 May 2022 13:25:22 -0700
>> Cc: Yoav Marco <yoavm448@gmail.com>,
>>  emacs-devel@gnu.org
>> 
>> font-lock: 88.28s -> 0.1997285067873303 / loop
>>                      ^^^^^^^^^^^^^^^^^^
>>                 should be 0.09754696132596685     
>
> Still an order of magnitude speedup: from 100 msec to 16 msec.
>
> Btw, how about adding this and other benchmarks to the test suite?  We
> may wish to use them later, to measure potential speedups or
> slowdowns.

Not sure if the test suite is the best place for that.
The `elisp-benchmarks` GNU ELPA package might be more appropriate.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12  5:17                     ` Eli Zaretskii
  2022-05-12  6:07                       ` Yuan Fu
  2022-05-12 14:16                       ` Yoav Marco
@ 2022-05-12 15:15                       ` Stefan Monnier
  2 siblings, 0 replies; 187+ messages in thread
From: Stefan Monnier @ 2022-05-12 15:15 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yuan Fu, yoavm448, emacs-devel

> ... some concrete scenarios, such as the 512-character chunk JIT font-lock
> uses during redisplay, ...

Side note: on `master`, jit-lock.el says:

    (defcustom jit-lock-chunk-size 1500


-- Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12  5:17                     ` Eli Zaretskii
  2022-05-12  6:07                       ` Yuan Fu
@ 2022-05-12 14:16                       ` Yoav Marco
  2022-05-12 16:04                         ` Eli Zaretskii
  2022-05-12 15:15                       ` Stefan Monnier
  2 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-12 14:16 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yuan Fu, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 5405 bytes --]

> Eli Zaretskii <eliz@gnu.org> writes:
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Wed, 11 May 2022 13:14:33 -0700
>> Cc: Yoav Marco <yoavm448@gmail.com>,
>>  emacs-devel@gnu.org
>>
>> I redid the benchmark, but without his reuse patch, just to see how
>> much time is spent on creating query objects. So fontifying 40 lines
>> for 463 times takes 6.92s (according to Emacs, 7.30s according to the
>> profiler). That counts to 0.0158s per call to font-lock-region, of
>> which 0.0104s is spent on creating the query object. That seems to
>> tell me if we optimize away the query object creation we can make
>> font-locking very very fast?

This is a little confusing, which profiler are we talking about? Is the
difference between Emacs's 6.92s and the profiler's 7.30 because Emacs
is only benchmarking the loop, and the profiler's measuring the entire
execution? Query compilation doesn't improve startup time, so the
conclusion that only 10ms is spent on query compilation might be wrong.
And it probably is: in my benchmark, query compilation improved
performance in much more than 16/6=266%: it went from 6.06 to 0.01.

> According to your benchmarks, it is already very fast: 16 msec is a
> negligible time interval.  Of course, 40 is a somewhat arbitrary
> number, but to get a less arbitrary one, we should determine it from
> some concrete scenarios, such as the 512-character chunk JIT font-lock
> uses during redisplay, or the number of lines on a typical window
> that's important when one scrolls with C-v/M-v, etc.

It's easy enough to convert the benchmarks to 512-chars chunks rather
than 40 lines. See table a few paragraphs below.

>> font-lock: 88.28s -> 0.1997285067873303 / loop
>
> So we already have an order-of-magnitude speed-up with tree-sitter: we
> go from 200 msec down to 16 msec.  Also, 200 msec is above the
> threshold of human perception of a response delay, whereas 16 msec is
> way below that threshold.  With such significantly faster font-lock, I
> wouldn't bother caching anything, at least not yet, not unless someone
> comes up with a practical use case where the query-compilation part
> really makes a significant practical difference in terms of absolute
> response times.

> Bottom line: I think the 6-msec speedup (from 16 to 10) in the
> scenario that was used in these benchmarks doesn't justify the
> complexities of caching the queries, given the overall excellent
> performance we get with tree-sitter.  Caching is an optimization, and
> in this case it sounds like doing that now would be a premature
> optimization.

As said, I think 16→10 is a wrong conclusion.

>> If we expose "compiled query” we don’t need to cache them either.
>
> Then the Lisp program will have to do that, which is even worse,
> because the problems I described will now have to be solved by Lisp
> application programmers, each time anew.

Will they? They'd just need to compile their queries once, when defining
them or when setting treesit-font-lock-defaults.

Right now the most convenient way to represent queries is as sexps, but
although treesit accepts queries as lists, major modes are encouraged to
stringify them, since the tree-sitter API works with string queries.
This exact discussion occurred when Theodor asked for feedback on the
go-mode.el:

> From: Yuan Fu <casouri@gmail.com>
> Date: Mon, 2022-05-09 21:10 UTC
> To: Eli Zaretskii
>
> I have some comments below, I haven’t tested the patch yet.
>>
>> +(defvar js-treesit-font-lock-settings-1
>> +  '((javascript
>> +     (
>> +      ((identifier) @font-lock-constant-face
>> +       (:match "^[A-Z_][A-Z_\\d]*$" @font-lock-constant-face))
>
> I would use treesit-expand-query to “expand” the sexp query to string,
> so Emacs don’t need to re-expand it every time treesit-query-capture is
> called. I don’t know how much it speed things up, but hey its free.

Why don't we check how much it speeds things up?

|   |                     | font-lock | TS sexp |     TS | TS query reuse |
| 1 | xdisp.c all at once |    12.886 |   0.031 |  0.016 |          0.017 |
| 2 | 20 × 512c           |     0.273 |   0.214 |  0.209 |          0.000 |
| 3 | 512c to end         |       4m+ |  24.177 | 23.474 |          0.037 |

So the time to stringify is negligible compared to query compilation.
Also, I don't know why font lock took that much time in the last
benchmark.

> or the number of lines on a typical window that's important when one
> scrolls with C-v/M-v, etc.
The following calculation sounds a little silly to me, but here it is anyway.

xdisp.c has 32.3 chars per line on average, so each 512 char
fontification covers 15.8 lines. My Emacs window can fit 50 lines, so
when jumping to an unfontified buffer location I'll need 4 calls for
fontification. That would take, depending on the engine:

| font-lock | TS sexp |    TS | TS query reuse |
|     0.054 |   0.042 | 0.041 |           0.00 |
(The 20 × 512c row, divided by 5 to represent 4 × 512c)

Improving fontification by 41ms is worth it in my opinion, as long as
it's not complicated, which it shouldn't be when letting users compile
their queries before use, though I don't know the downsides of exposing
another type to lisp. (Currently tree-sitter adds two new types,
treesit-node and treesit-parser.)

 - Yoav

[-- Attachment #2: tree-sitter-benchmark.el --]
[-- Type: text/plain, Size: 3156 bytes --]

;;; tree-sitter-benchmark.el -*- lexical-binding: t; -*-
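;; run benchmark with
;; emacs -Q --script tree-sitter-benchmark.el [-1] [-2] [-3] [-regexp]
;; NOTE: nothing in this file reads those switches; the engine is chosen
;; by the `fontifying-mode' setq below and the query form by `query-type'.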

(require 'treesit)
(require 'cc-mode)

(defvar query-type 'list
  "Form in which the tree-sitter query is stored.
Either `string' (the text of the query file, with capture names
rewritten) or `list' (that text wrapped in parentheses and `read' as
a sexp).  Consulted when `c-font-lock-settings-1' is defined; any
other value makes that definition signal a `user-error'.")
(defcustom fontifying-mode 'treesit
  "Fontification engine to benchmark.
With `treesit', `setup-fontification' enables tree-sitter font-lock
using `c-font-lock-settings-1'; with `regexp' it turns on `c-mode'
and relies on that mode's own font-lock setup."
  :type '(choice (const treesit)
                 (const regexp)))
;; This assignment overrides the default declared above: as written, the
;; script benchmarks the `regexp' engine.  Change it to benchmark `treesit'.
(setq fontifying-mode 'regexp)


(defvar c-font-lock-settings-1
  (let ((query
         (with-temp-buffer
           (insert-file-contents-literally "./highlights.scm")
           ;; Every capture name must name a face; which face does not
           ;; matter for timing, so map them all to the same one.
           (goto-char (point-min))
           (while (re-search-forward "@[a-z.]+" nil t)
             (replace-match "@font-lock-string-face" t))
           (cond
            ((eq query-type 'string)
             (buffer-string))
            ((eq query-type 'list)
             ;; Wrap the whole file in one pair of parentheses so that a
             ;; single `read' returns all the patterns as one list.
             (goto-char (point-max))
             (insert ")")
             (goto-char (point-min))
             (insert "(")
             (goto-char (point-min))
             (read (current-buffer)))
            (t
             (user-error "`query-type' must be 'string or 'list"))))))
    (list (list 'c query)))
  "Font-lock settings used by the tree-sitter benchmark.
A one-element list of the form ((c QUERY)), where QUERY is built from
the file \"./highlights.scm\" (resolved against `default-directory')
and is either a string or a sexp, depending on `query-type'.")

(defun setup-fontification ()
  "Prepare the current buffer for the engine named by `fontifying-mode'.
For `regexp', turn on `c-mode'.  For `treesit', create a parser for
`c', install `c-font-lock-settings-1' as the buffer-local
`treesit-font-lock-defaults', enable tree-sitter font-lock, and
override `font-lock-default-fontify-region' with `ignore'.  That
advice is global; the caller is responsible for removing it.
Any other value of `fontifying-mode' does nothing."
  (cond
   ((eq fontifying-mode 'regexp)
    (c-mode))
   ((eq fontifying-mode 'treesit)
    (treesit-get-parser-create 'c)
    ;; This needs to be non-nil, because reasons
    (or font-lock-defaults
        (setq font-lock-defaults '(nil t)))
    (setq-local treesit-font-lock-defaults
                '((c-font-lock-settings-1)))
    (treesit-font-lock-enable)
    ;; Presumably so that only the tree-sitter fontifier does any work
    ;; while timing -- confirm against the treesit font-lock hook-up.
    (advice-add #'font-lock-default-fontify-region :override #'ignore))))

(defun fontify (beg end)
  "Fontify the text between BEG and END with `font-lock-fontify-region'.
Both supported engines go through the same entry point; which code
actually runs is decided by what `setup-fontification' installed.
Do nothing and return nil unless `fontifying-mode' is `treesit' or
`regexp'."
  (when (memq fontifying-mode '(treesit regexp))
    (font-lock-fontify-region beg end)))

(defun buffer-middle ()
  "Return the position halfway through the accessible part of the buffer.
This is the integer mean of `point-min' and `point-max', rounded down."
  (let ((lo (point-min))
        (hi (point-max)))
    (/ (+ lo hi) 2)))

;; Benchmark driver.  Loads xdisp.c (resolved against `default-directory')
;; into a temporary buffer and times three fontification patterns with the
;; engine selected by `fontifying-mode', printing one line per benchmark.
(with-temp-buffer
  ;; `setup-fontification' may install a global :override advice on
  ;; `font-lock-default-fontify-region'.  Remove it even when a benchmark
  ;; signals (e.g. xdisp.c is missing), so that evaluating this file in an
  ;; interactive session cannot leave regexp font-lock disabled.
  (unwind-protect
      (progn
        (message "Fontification method: %s %s" fontifying-mode query-type)
        (setup-fontification)
        (insert-file-contents "xdisp.c")
        (apply #'message
               "Benchmark 1: fontify xdisp.c all at once.\
 took %2.3f, with %d gc runs (meaning %2.3f)"
               (benchmark-run 1
                 (fontify (point-min) (point-max))))

        ;; Drop the result of the previous run so the next one starts from
        ;; unfontified text.
        ;; NOTE(review): this clears every text property, not only faces; in
        ;; regexp mode that presumably includes properties the major mode
        ;; maintains for itself -- confirm whether that skews the font-lock
        ;; timings.
        (set-text-properties (point-min) (point-max) nil)

        ;; fontify xdisp.c from the middle, since it starts with a comment
        ;; header of 22k chars
        (goto-char (buffer-middle))
        (apply #'message
               "Benchmark 2: fontify part of xdisp.c, 20 batches of 512 chars.\
 took %2.3f, with %d gc runs (meaning %2.3f)"
               (benchmark-run 1
                 (dotimes (_ 20)
                   (fontify (point) (min (+ 512 (point)) (point-max)))
                   ;; Clamp the step like benchmark 3 does; a bare
                   ;; `forward-char' signals `end-of-buffer' when fewer than
                   ;; 512 characters remain.
                   (goto-char (min (+ 512 (point)) (point-max))))))

        ;; NOTE(review): copies a sample of the benchmarked text to the kill
        ;; ring; it does not take part in any timing -- looks like a leftover
        ;; debugging aid, confirm before removing.
        (kill-new (buffer-substring (buffer-middle) (+ (* 512 10) (buffer-middle))))

        (set-text-properties (point-min) (point-max) nil)

        (goto-char (point-min))
        (apply #'message
               "Benchmark 3: fontify all of xdisp.c, 512 chars at a time.\
 took %2.3f, with %d gc runs (meaning %2.3f)"
               (benchmark-run 1
                 (while (/= (point-max) (point))
                   (fontify (point) (min (+ 512 (point)) (point-max)))
                   (goto-char (min (+ 512 (point)) (point-max)))))))
    (advice-remove #'font-lock-default-fontify-region #'ignore)))

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12  6:10                         ` Yuan Fu
@ 2022-05-12  7:12                           ` Eli Zaretskii
  0 siblings, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-12  7:12 UTC (permalink / raw)
  To: Yuan Fu; +Cc: yoavm448, emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Wed, 11 May 2022 23:10:40 -0700
> Cc: yoavm448@gmail.com,
>  emacs-devel@gnu.org
> 
> > Btw, how about adding this and other benchmarks to the test suite?  We
> > may wish to use them later, to measure potential speedups or
> > slowdowns.
> 
> Sure, could you give me some pointers to adding a timed test suite? I don’t find anything like that in ERT, except :expensive-test.

Just put it in test/manual/, and add some minimal instructions to the
.el file(s) explaining how to run the benchmarks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12  5:19                       ` Eli Zaretskii
@ 2022-05-12  6:10                         ` Yuan Fu
  2022-05-12  7:12                           ` Eli Zaretskii
  2022-05-12 15:18                         ` Stefan Monnier
  1 sibling, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-12  6:10 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: yoavm448, emacs-devel



> On May 11, 2022, at 10:19 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Wed, 11 May 2022 13:25:22 -0700
>> Cc: Yoav Marco <yoavm448@gmail.com>,
>> emacs-devel@gnu.org
>> 
>> font-lock: 88.28s -> 0.1997285067873303 / loop
>>                     ^^^^^^^^^^^^^^^^^^
>>                should be 0.09754696132596685     
> 
> Still an order of magnitude speedup: from 100 msec to 16 msec.
> 
> Btw, how about adding this and other benchmarks to the test suite?  We
> may wish to use them later, to measure potential speedups or
> slowdowns.

Sure, could you give me some pointers to adding a timed test suite? I don’t find anything like that in ERT, except :expensive-test.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-12  5:17                     ` Eli Zaretskii
@ 2022-05-12  6:07                       ` Yuan Fu
  2022-05-12 14:16                       ` Yoav Marco
  2022-05-12 15:15                       ` Stefan Monnier
  2 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-12  6:07 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yoav Marco, emacs-devel



> On May 11, 2022, at 10:17 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Wed, 11 May 2022 13:14:33 -0700
>> Cc: Yoav Marco <yoavm448@gmail.com>,
>> emacs-devel@gnu.org
>> 
>>> |   |                                      | no reuse (now) | reuse |
>>> | 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
>>> | 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
>>> | 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |
>>> 
>>> If so, what is the significance of the last line in practical use
>>> cases?  JIT font-lock never fontifies such large chunks of source
>>> code, it does that in 512-character chunks, which is less than 60
>>> lines in most cases, and definitely not "till the end".
>> 
>> I think that’s just a way to run font-lock enough times without repeatedly fontifying the same region?
> 
> Then I'm not sure the result is very interesting by itself, unless we
> can find a way to use that result for estimating how long will it take
> to perform fontifications in some practical use cases that we care
> about, and compare that to what we have now in those use cases.
> 
>> I redid the benchmark, but without his reuse patch, just to see how much time is spent on creating query objects. So fontifying 40 lines for 463 times takes 6.92s (according to Emacs, 7.30s according to the profiler). That comes to 0.0158s per call to font-lock-fontify-region, of which 0.0104s is spent on creating the query object. That seems to tell me if we optimize away the query object creation we can make font-locking very very fast?
> 
> According to your benchmarks, it is already very fast: 16 msec is a
> negligible time interval.  Of course, 40 is a somewhat arbitrary
> number, but to get a less arbitrary one, we should determine it from
> some concrete scenarios, such as the 512-character chunk JIT font-lock
> uses during redisplay, or the number of lines on a typical window
> that's important when one scrolls with C-v/M-v, etc.
> 
>> If we expose "compiled query” we don’t need to cache them either.
> 
> Then the Lisp program will have to do that, which is even worse,
> because the problems I described will now have to be solved by Lisp
> application programmers, each time anew.
> 
>> Benchmark 3: fontify all of xdisp.c, 40 lines at a time.
>> took 88.28, of which 5.00 is GC (4 gc runs), loop count: 905
>> 
>> font-lock: 88.28s -> 0.1997285067873303 / loop
> 
> So we already have an order-of-magnitude speed-up with tree-sitter: we
> go from 200 msec down to 16 msec.  Also, 200 msec is above the
> threshold of human perception of a response delay, whereas 16 msec is
> way below that threshold.  With such significantly faster font-lock, I
> wouldn't bother caching anything, at least not yet, not unless someone
> comes up with a practical use case where the query-compilation part
> really makes a significant practical difference in terms of absolute
> response times.
> 
> Bottom line: I think the 6-msec speedup (from 16 to 10) in the
> scenario that was used in these benchmarks doesn't justify the
> complexities of caching the queries, given the overall excellent
> performance we get with tree-sitter.  Caching is an optimization, and
> in this case it sounds like doing that now would be a premature
> optimization.

Sure, that makes sense, and I save writing code ;-) If we want it later we can easily add that without breaking any API.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 20:25                     ` Yuan Fu
@ 2022-05-12  5:19                       ` Eli Zaretskii
  2022-05-12  6:10                         ` Yuan Fu
  2022-05-12 15:18                         ` Stefan Monnier
  0 siblings, 2 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-12  5:19 UTC (permalink / raw)
  To: Yuan Fu; +Cc: yoavm448, emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Wed, 11 May 2022 13:25:22 -0700
> Cc: Yoav Marco <yoavm448@gmail.com>,
>  emacs-devel@gnu.org
> 
> font-lock: 88.28s -> 0.1997285067873303 / loop
>                      ^^^^^^^^^^^^^^^^^^
>                 should be 0.09754696132596685     

Still an order of magnitude speedup: from 100 msec to 16 msec.

Btw, how about adding this and other benchmarks to the test suite?  We
may wish to use them later, to measure potential speedups or
slowdowns.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 20:14                   ` Yuan Fu
  2022-05-11 20:25                     ` Yuan Fu
@ 2022-05-12  5:17                     ` Eli Zaretskii
  2022-05-12  6:07                       ` Yuan Fu
                                         ` (2 more replies)
  1 sibling, 3 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-12  5:17 UTC (permalink / raw)
  To: Yuan Fu; +Cc: yoavm448, emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Wed, 11 May 2022 13:14:33 -0700
> Cc: Yoav Marco <yoavm448@gmail.com>,
>  emacs-devel@gnu.org
> 
> >  |   |                                      | no reuse (now) | reuse |
> >  | 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
> >  | 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
> >  | 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |
> > 
> > If so, what is the significance of the last line in practical use
> > cases?  JIT font-lock never fontifies such large chunks of source
> > code, it does that in 512-character chunks, which is less than 60
> > lines in most cases, and definitely not "till the end".
> 
> I think that’s just a way to run font-lock enough times without repeatedly fontifying the same region?

Then I'm not sure the result is very interesting by itself, unless we
can find a way to use that result for estimating how long will it take
to perform fontifications in some practical use cases that we care
about, and compare that to what we have now in those use cases.

> I redid the benchmark, but without his reuse patch, just to see how much time is spent on creating query objects. So fontifying 40 lines for 463 times takes 6.92s (according to Emacs, 7.30s according to the profiler). That comes to 0.0158s per call to font-lock-fontify-region, of which 0.0104s is spent on creating the query object. That seems to tell me if we optimize away the query object creation we can make font-locking very very fast?

According to your benchmarks, it is already very fast: 16 msec is a
negligible time interval.  Of course, 40 is a somewhat arbitrary
number, but to get a less arbitrary one, we should determine it from
some concrete scenarios, such as the 512-character chunk JIT font-lock
uses during redisplay, or the number of lines on a typical window
that's important when one scrolls with C-v/M-v, etc.

> If we expose "compiled query” we don’t need to cache them either.

Then the Lisp program will have to do that, which is even worse,
because the problems I described will now have to be solved by Lisp
application programmers, each time anew.

> Benchmark 3: fontify all of xdisp.c, 40 lines at a time.
> took 88.28, of which 5.00 is GC (4 gc runs), loop count: 905
> 
> font-lock: 88.28s -> 0.1997285067873303 / loop

So we already have an order-of-magnitude speed-up with tree-sitter: we
go from 200 msec down to 16 msec.  Also, 200 msec is above the
threshold of human perception of a response delay, whereas 16 msec is
way below that threshold.  With such significantly faster font-lock, I
wouldn't bother caching anything, at least not yet, not unless someone
comes up with a practical use case where the query-compilation part
really makes a significant practical difference in terms of absolute
response times.

Bottom line: I think the 6-msec speedup (from 16 to 10) in the
scenario that was used in these benchmarks doesn't justify the
complexities of caching the queries, given the overall excellent
performance we get with tree-sitter.  Caching is an optimization, and
in this case it sounds like doing that now would be a premature
optimization.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 20:14                   ` Yuan Fu
@ 2022-05-11 20:25                     ` Yuan Fu
  2022-05-12  5:19                       ` Eli Zaretskii
  2022-05-12  5:17                     ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-11 20:25 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yoav Marco, emacs-devel

font-lock: 88.28s -> 0.1997285067873303 / loop
                     ^^^^^^^^^^^^^^^^^^
                should be 0.09754696132596685     

Yuan



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 16:27                 ` Eli Zaretskii
@ 2022-05-11 20:14                   ` Yuan Fu
  2022-05-11 20:25                     ` Yuan Fu
  2022-05-12  5:17                     ` Eli Zaretskii
  0 siblings, 2 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-11 20:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Yoav Marco, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 2354 bytes --]

> 
> And the timings are in the table below?
> 
>  |   |                                      | no reuse (now) | reuse |
>  | 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
>  | 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
>  | 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |
> 
> If so, what is the significance of the last line in practical use
> cases?  JIT font-lock never fontifies such large chunks of source
> code, it does that in 512-character chunks, which is less than 60
> lines in most cases, and definitely not "till the end".

I think that’s just a way to run font-lock enough times without repeatedly fontifying the same region?

> 
> Also, how much time does it take to do the same with the current
> regexp- and syntax-based font-lock, for the same chunks of text?
> 
> We need to examine the use cases and the absolute numbers carefully
> before we conclude that any kind of caching is needed and/or
> justified.
> 

I redid the benchmark, but without his reuse patch, just to see how much time is spent on creating query objects. So fontifying 40 lines for 463 times takes 6.92s (according to Emacs, 7.30s according to the profiler). That comes to 0.0158s per call to font-lock-fontify-region, of which 0.0104s is spent on creating the query object. That seems to tell me if we optimize away the query object creation we can make font-locking very very fast? And not just font-locking, since using tree-sitter to do anything useful basically means querying the parsed tree.

If we expose "compiled query” we don’t need to cache them either.

The regex-based font-lock is a lot slower. With or without the optimization, tree-sitter is a win, but we knew that already. I have no idea why regex font-lock ran for 905 loops compared to 463 for tree-sitter. Maybe I did something wrong there.

Benchmark 3: fontify all of xdisp.c, 40 lines at a time.
took 6.92, of which 1.00 is GC (0 gc runs), loop count: 463

font-lock:    7.30s -> 0.015766738660907127 / loop
ts_query_new: 4.80s -> 0.010367170626349892s / loop

Note: 7.30 is taken from external profiler.

Benchmark 3: fontify all of xdisp.c, 40 lines at a time.
took 88.28, of which 5.00 is GC (4 gc runs), loop count: 905

font-lock: 88.28s -> 0.1997285067873303 / loop

Yuan


[-- Attachment #2: tree-sitter-benchmark.el --]
[-- Type: application/octet-stream, Size: 1673 bytes --]

;;; tree-sitter-benchmark.el -*- lexical-binding: t; -*-

(require 'treesit)
;; Font-lock settings for the benchmark: a single entry for language
;; `c' whose query text is highlights.scm (read from the current
;; directory) with every capture name rewritten to the same face.
;; The benchmark only measures speed, so which face is used is
;; irrelevant; it merely has to name an existing face.
(setq c-font-lock-settings-1
      (list
       (list 'c
             (with-temp-buffer
               (insert-file-contents-literally "./highlights.scm")
               ;; Rewrite each "@capture.name" so that it maps to a face.
               (goto-char (point-min))
               (while (re-search-forward "@[a-z.]+" nil t)
                 (replace-match "@font-lock-string-face" t))
               (buffer-string)))))

;; Benchmark: tree-sitter font-lock over xdisp.c, 40 lines per call.
;;
;; Loads xdisp.c (from the current directory) into a temporary buffer
;; with tree-sitter font-lock enabled, then fontifies it in
;; consecutive 40-line chunks and reports the total elapsed time, the
;; GC time, the number of GC runs and the number of chunks processed.
(with-temp-buffer
  (treesit-get-parser-create 'c)
  (setq-local treesit-font-lock-defaults
              '((c-font-lock-settings-1)))
  (font-lock-mode)
  (treesit-font-lock-enable)
  (insert-file-contents "xdisp.c")
  (let* ((count 0)
         ;; `benchmark-run' returns (ELAPSED GC-COUNT GC-TIME).
         (timing
          (benchmark-run 1
            (while (/= (point-max) (point))
              ;; Keep the 40-line stride independent of whatever the
              ;; fontification function does to point.
              ;; NOTE(review): the 463 vs. 905 loop counts reported for
              ;; the two benchmarks on the same file suggest the
              ;; tree-sitter fontifier leaves point moved -- confirm.
              (save-excursion
                (font-lock-fontify-region (point) (line-end-position 40)))
              (forward-line 40)
              ;; Plain `setq' avoids depending on cl-lib, which this
              ;; file never requires.
              (setq count (1+ count))))))
    ;; The format string wants: elapsed, GC *time*, GC *run count*,
    ;; loop count.  `benchmark-run' yields the GC count before the GC
    ;; time, so pick the elements explicitly instead of splicing the
    ;; result list in as-is (which printed the two values swapped).
    (message
     "Benchmark 3: fontify all of xdisp.c, 40 lines at a time.\
  took %2.2f, of which %2.2f is GC (%d gc runs), loop count: %s"
     (nth 0 timing) (nth 2 timing) (nth 1 timing) count)))

;; Benchmark: c-mode font-lock (the baseline) over xdisp.c, 40 lines
;; per call.
;;
;; Same procedure as the tree-sitter benchmark in this file, but the
;; buffer is put in `c-mode' and tree-sitter font-lock is not enabled.
(with-temp-buffer
  ;; NOTE(review): this parser is not used by anything visible in this
  ;; form and looks like a leftover from copying the tree-sitter
  ;; benchmark; kept to preserve the original setup -- confirm whether
  ;; it can be dropped.
  (treesit-get-parser-create 'c)
  (c-mode)
  (insert-file-contents "xdisp.c")
  (let* ((count 0)
         ;; `benchmark-run' returns (ELAPSED GC-COUNT GC-TIME).
         (timing
          (benchmark-run 1
            (while (/= (point-max) (point))
              ;; Keep the 40-line stride independent of whatever the
              ;; fontification function does to point, so both
              ;; benchmarks cover the buffer in the same chunks.
              (save-excursion
                (font-lock-fontify-region (point) (line-end-position 40)))
              (forward-line 40)
              ;; Plain `setq' avoids depending on cl-lib, which this
              ;; file never requires.
              (setq count (1+ count))))))
    ;; The format string wants: elapsed, GC *time*, GC *run count*,
    ;; loop count.  `benchmark-run' yields the GC count before the GC
    ;; time, so pick the elements explicitly instead of splicing the
    ;; result list in as-is (which printed the two values swapped).
    (message
     "Benchmark 3: fontify all of xdisp.c, 40 lines at a time.\
  took %2.2f, of which %2.2f is GC (%d gc runs), loop count: %s"
     (nth 0 timing) (nth 2 timing) (nth 1 timing) count)))

[-- Attachment #3: highlights.scm --]
[-- Type: application/octet-stream, Size: 3299 bytes --]

;; Copied from elisp-tree-sitter/langs/queries/c
;; Tree-sitter highlighting query for C.  Each pattern tags the nodes
;; it matches with the capture name following `@'.  A quoted string
;; matches an anonymous (literal token) node with that text, a
;; parenthesized name matches a named node, and a bracketed list
;; matches any one of its alternatives.
;; Keyword tokens.
["break"
 "case"
 "const"
 "continue"
 "default"
 "do"
 "else"
 "enum"
 "extern"
 "for"
 "if"
 "inline"
 "return"
 "sizeof"
 "static"
 "struct"
 "switch"
 "typedef"
 "union"
 "volatile"
 "while"
 "..."] @keyword

;; Whole storage-class and type-qualifier nodes are also keywords.
[(storage_class_specifier)
 (type_qualifier)] @keyword

;; Preprocessor directive tokens, plus the generic directive node.
["#define"
 "#else"
 "#endif"
 "#if"
 "#ifdef"
 "#ifndef"
 "#include"
 (preproc_directive)] @function.macro

;; The identifier that follows an #ifdef/#ifndef token.
((["#ifdef" "#ifndef"] (identifier) @constant))

;; Operator tokens.
["+" "-" "*" "/" "%"
 "~" "|" "&" "<<" ">>"
 "!" "||" "&&"
 "->"
 "==" "!=" "<" ">" "<=" ">="
 "=" "+=" "-=" "*=" "/=" "%=" "|=" "&="
 "++" "--"
] @operator

;; "?" and ":" count as operators only inside a conditional expression.
(conditional_expression ["?" ":"] @operator)

;; Bracket tokens.
["(" ")" "[" "]" "{" "}"] @punctuation.bracket

;; Delimiter tokens.
["." "," ";"] @punctuation.delimiter

;;; ----------------------------------------------------------------------------
;;; Functions.

;; Call sites: the `function' field is either a plain identifier
;; (@function.call) or a field expression, in which case whatever
;; node sits in its `field' field is captured as @method.call.
(call_expression
 function: [(identifier) @function.call
            (field_expression field: (_) @method.call)])

;; Declarators: the declared name is either a plain identifier or a
;; field_identifier under a pointer declarator inside a parenthesized
;; declarator.
(function_declarator
 declarator: [(identifier) @function
              (parenthesized_declarator
               (pointer_declarator (field_identifier) @function))])

;; Name of a function-like preprocessor definition.
(preproc_function_def
 name: (identifier) @function)

;;; ----------------------------------------------------------------------------
;;; Types.

;; Primitive and sized type specifier nodes.
[(primitive_type)
 (sized_type_specifier)] @type.builtin

;; Every type_identifier node.
(type_identifier) @type

;;; ----------------------------------------------------------------------------
;;; Variables.

;; In the three patterns below the `declarator' field is matched
;; either as a bare identifier or as any named node (`_') that has an
;; identifier as a direct child.
(declaration declarator: [(identifier) @variable
                          (_ (identifier) @variable)])

(parameter_declaration declarator: [(identifier) @variable.parameter
                                    (_ (identifier) @variable.parameter)])

(init_declarator declarator: [(identifier) @variable
                              (_ (identifier) @variable)])

;; Assignment targets: a plain identifier, the `field' of a field
;; expression, the identifier in the `argument' field of a subscript
;; expression, or an identifier under a pointer expression.
(assignment_expression
 left: [(identifier) @variable
        (field_expression field: (_) @variable)
        (subscript_expression argument: (identifier) @variable)
        (pointer_expression (identifier) @variable)])

;; Identifier in the `argument' field of an update expression.
(update_expression
 argument: (identifier) @variable)

;; Name of a preprocessor definition.
(preproc_def name: (identifier) @variable.special)

;; Identifiers in a preprocessor parameter list.
(preproc_params
 (identifier) @variable.parameter)

;;; ----------------------------------------------------------------------------
;;; Properties.

;; Field definitions: the field_identifier directly in the
;; `declarator' field, or nested under one or two pointer declarators.
(field_declaration
 declarator: [(field_identifier) @property.definition
              (pointer_declarator (field_identifier) @property.definition)
              (pointer_declarator (pointer_declarator (field_identifier) @property.definition))])

;; Enumerator names are tagged like field definitions.
(enumerator name: (identifier) @property.definition)

;; Catch-all for every field_identifier node.
;; NOTE(review): nodes matched by the field_declaration pattern above
;; match this one as well; which capture takes effect presumably
;; depends on how the consumer orders captures -- confirm.
(field_identifier) @property

;;; ----------------------------------------------------------------------------
;;; Misc.

;; Doesn't work right now: results in error Query pattern is malformed: "Cannot
;; find captured node", "^[A-Z_][A-Z_\\d]*$", "A predicate can only refer to
;; captured nodes in the same pattern"
;; ((identifier) @constant
;;  (.match @constant "^[A-Z_][A-Z_\\d]*$"))
;; NOTE(review): the disabled pattern above was meant to tag
;; identifiers matching the regexp as constants.  Its `.match'
;; predicate spelling presumably comes from elisp-tree-sitter; check
;; which predicate syntax the consumer expects before re-enabling it.

;; Literal nodes for null, true and false.
[(null) (true) (false)] @constant.builtin

;; Number and character literal nodes share one capture.
[(number_literal)
 (char_literal)] @number

;; Statement label identifiers.
(statement_identifier) @label

;;; ----------------------------------------------------------------------------
;;; Strings and comments.

;; Comment nodes.
(comment) @comment

;; String literal and system_lib_string nodes.
[(string_literal)
 (system_lib_string)] @string

[-- Attachment #4: Type: text/plain, Size: 2 bytes --]




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 15:40               ` Yoav Marco
@ 2022-05-11 16:27                 ` Eli Zaretskii
  2022-05-11 20:14                   ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-11 16:27 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Wed, 11 May 2022 18:40:24 +0300
> 
> > So let's start with the benchmarks, and please tell what exactly did
> > Emacs do to trigger fontifications in each benchmark.
> 
> I called treesit-font-lock-fontify-region, which is the main function
> used for syntax highlighting in treesit as far as I'm aware.
> It's the value of font-lock-fontify-region-function after calling
> treesit-font-lock-enable.
> 
> (The code's attached in the original mail)

And the timings are in the table below?

  |   |                                      | no reuse (now) | reuse |
  | 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
  | 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
  | 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |

If so, what is the significance of the last line in practical use
cases?  JIT font-lock never fontifies such large chunks of source
code, it does that in 512-character chunks, which is less than 60
lines in most cases, and definitely not "till the end".

Also, how much time does it take to do the same with the current
regexp- and syntax-based font-lock, for the same chunks of text?

We need to examine the use cases and the absolute numbers carefully
before we conclude that any kind of caching is needed and/or
justified.

Thanks.

P.S. If the above table is not the relevant benchmarks, please show
the URL of the message in the archive where you posted the relevant
benchmarks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 14:20             ` Eli Zaretskii
@ 2022-05-11 15:40               ` Yoav Marco
  2022-05-11 16:27                 ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-11 15:40 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

> So let's start with the benchmarks, and please tell what exactly did
> Emacs do to trigger fontifications in each benchmark.

I called treesit-font-lock-fontify-region, which is the main function
used for syntax highlighting in treesit as far as I'm aware.
It's the value of font-lock-fontify-region-function after calling
treesit-font-lock-enable.

(The code's attached in the original mail)



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 11:16           ` Yoav Marco
@ 2022-05-11 14:20             ` Eli Zaretskii
  2022-05-11 15:40               ` Yoav Marco
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-11 14:20 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Wed, 11 May 2022 14:16:03 +0300
> 
> Tree sitter parsers make a parse tree. To extract syntax highlighting
> from the tree you run a query that matches tree structures returns
> annotated nodes.
> 
> Queries need to be compiled to run them. Right now queries are kept as
> elisp strings and compiled each time we want to run them.
> 
> Compiling on each run is fine if we run the query extracting syntax
> highlighting information once, but to highlight a buffer in parts
> on-demand means running (thus compiling) the query every time on
> different ranges. When editing buffer the query is re-run on each change
> too.
> 
> I made a benchmark that tested whether reusing the compiled query would
> make anything faster, and turns out it's the biggest bottleneck in
> treesit-font-lock-fontify-region.

How much time does this bottleneck take us?  (I've seen the benchmarks
you posted, but I don't think I understood them, in terms of what
exactly was fontified and due to which command.)

Keeping a cache related to a buffer is a nuisance, because you need to
manage it: track its state and detect when it's invalid etc.  So if
the slowdown is not large enough in absolute (not relative!) terms, my
advice would be to just suck it up.

So let's start with the benchmarks, and please tell what exactly did
Emacs do to trigger fontifications in each benchmark.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-11 11:10         ` Eli Zaretskii
@ 2022-05-11 11:16           ` Yoav Marco
  2022-05-11 14:20             ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-11 11:16 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel


> I admit that I don't have a clear idea of the issues that bother you.
> Can you post a summary?

Tree-sitter parsers make a parse tree. To extract syntax highlighting
from the tree you run a query that matches tree structures and returns
annotated nodes.

Queries need to be compiled to run them. Right now queries are kept as
elisp strings and compiled each time we want to run them.

Compiling on each run is fine if we run the query extracting syntax
highlighting information once, but to highlight a buffer in parts
on-demand means running (thus compiling) the query every time on
different ranges. When editing a buffer the query is re-run on each change
too.

I made a benchmark that tested whether reusing the compiled query would
make anything faster, and turns out it's the biggest bottleneck in
treesit-font-lock-fontify-region.

The question right now is how to reuse queries. We could make a cache
like the one for compiled regexps in search.c, or
> On May 10, 2022, at 23:53 UTC, Yuan Fu <casouri@gmail.com> wrote:
>
> just expose query object, and let user store them in lisp. Is there
> any downsides of exposing another type to lisp? Currently tree-sitter
> adds two new types: treesit-node and treesit-parser.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 18:18       ` Yoav Marco
  2022-05-10 19:58         ` Stefan Monnier
@ 2022-05-11 11:10         ` Eli Zaretskii
  2022-05-11 11:16           ` Yoav Marco
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-11 11:10 UTC (permalink / raw)
  To: Yoav Marco; +Cc: casouri, emacs-devel

> From: Yoav Marco <yoavm448@gmail.com>
> Cc: emacs-devel@gnu.org
> Date: Tue, 10 May 2022 21:18:25 +0300
> 
> Eli/Stefan, please guide us!

I admit that I don't have a clear idea of the issues that bother you.
Can you post a summary?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 23:11           ` Yuan Fu
@ 2022-05-10 23:53             ` Yuan Fu
  0 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-10 23:53 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Yoav Marco, emacs-devel



> On May 10, 2022, at 4:11 PM, Yuan Fu <casouri@gmail.com> wrote:
> 
> 
> 
>> On May 10, 2022, at 12:58 PM, Stefan Monnier <monnier@iro.umontreal.ca> wrote:
>> 
>>> Eli/Stefan, please guide us!
>> 
>> I don't have much to say. OK, let's invent something:
>> 
>> 1- Maybe it would be good to expose "compiled query" objects to ELisp,
>> so the compilation could be performed by explicit requests (so you
>> don't have to cache it, instead the major modes would call the
>> function when installing their indentation/fontlock rules).
> 
> Maybe something like (treesit-cache-query key query), that saves exposing another type to lisp.
> 
> 
>> 
>> 2- I don't understand why the use of hash map is perceived as complex.
>> It seems like a fairly simple solution.
> 
> I’m not sure how would we garbage collect unused queries, maybe I’m missing something. And is there a C hash table that we can use? Or we need to use the lisp hash table? 
> 
> Maybe we can use a buffer-local treesit--query-cache hash table and store USERPTR of TSQuery in it, so we don’t worry about garbage collecting. Would that be slow (getting the buffer-local variable, do a lookup, takeout the TSQuery object)?

Or just expose query object, and let user store them in lisp. Is there any downsides of exposing another type to lisp? Currently tree-sitter adds two new types: treesit-node and treesit-parser.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 19:58         ` Stefan Monnier
@ 2022-05-10 23:11           ` Yuan Fu
  2022-05-10 23:53             ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-10 23:11 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Yoav Marco, emacs-devel



> On May 10, 2022, at 12:58 PM, Stefan Monnier <monnier@iro.umontreal.ca> wrote:
> 
>> Eli/Stefan, please guide us!
> 
> I don't have much to say.  OK, let's invent something:
> 
> 1- Maybe it would be good to expose "compiled query" objects to ELisp,
>   so the compilation could be performed by explicit requests (so you
>   don't have to cache it, instead the major modes would call the
>   function when installing their indentation/fontlock rules).

Maybe something like (treesit-cache-query key query), that saves exposing another type to lisp.


> 
> 2- I don't understand why the use of hash map is perceived as complex.
>   It seems like a fairly simple solution.

I’m not sure how would we garbage collect unused queries, maybe I’m missing something. And is there a C hash table that we can use? Or we need to use the lisp hash table? 

Maybe we can use a buffer-local treesit--query-cache hash table and store USERPTR of TSQuery in it, so we don’t have to worry about garbage collection. Would that be slow (getting the buffer-local variable, doing a lookup, taking out the TSQuery object)?

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 18:18       ` Yoav Marco
@ 2022-05-10 19:58         ` Stefan Monnier
  2022-05-10 23:11           ` Yuan Fu
  2022-05-11 11:10         ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Stefan Monnier @ 2022-05-10 19:58 UTC (permalink / raw)
  To: Yoav Marco; +Cc: Yuan Fu, emacs-devel

> Eli/Stefan, please guide us!

I don't have much to say.  OK, let's invent something:

1- Maybe it would be good to expose "compiled query" objects to ELisp,
   so the compilation could be performed by explicit requests (so you
   don't have to cache it, instead the major modes would call the
   function when installing their indentation/fontlock rules).

2- I don't understand why the use of hash map is perceived as complex.
   It seems like a fairly simple solution.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 17:54     ` Yuan Fu
@ 2022-05-10 18:18       ` Yoav Marco
  2022-05-10 19:58         ` Stefan Monnier
  2022-05-11 11:10         ` Eli Zaretskii
  2022-05-15 19:20       ` chad
  1 sibling, 2 replies; 187+ messages in thread
From: Yoav Marco @ 2022-05-10 18:18 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1577 bytes --]


> But could you maybe run the benchmark under gprof and see what you get? Just curious.

I did, I just don't understand gprof's output very well. I've attached
two gmon.out and perf.data files if you have anything specific in mind
to try.

>> So, is caching worth it? I don't know. It definitely is if it's possible
>> to do it internally without introducing a new object type. But I don't
>> think that's possible without making a hash map or a complicated cache
>> like the one for compiled regexps that compile_pattern uses in
>> search.c.
>
> Yeah using a single cache would probably result in a lot of misses since Emacs
> don’t fontify the whole buffer at once. We don’t necessarily need to use a hash
> map. I had a look at search.c and IIUC it uses an Emacs-wide array of 20 regex
> caches and links them into a linked list sorted by most-recently used, which
> doesn’t seem too bad? I think I can do something similar to that. Tho we might
> also want to allow users to pin some “persistent” cache, for example major mode
> font-locking and indent queries, as they are guaranteed to be reused a lot and
> are generally large (ie, slow to create). Maybe that’s unnecessary tho. And I
> wonder if there is a cheap & easy way to do caching buffer-locally…
>
> Or maybe add an argument to query-capture that allow the user to specify whether
> they want the query to be cached, or assume user wants the query to be cached if
> the query is in string form rather than in sexp form.

Eli/Stefan, please guide us!


  Yoav




[-- Attachment #2: profiling-data.tar.gz --]
[-- Type: application/gzip, Size: 444148 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-10 15:43   ` Yoav Marco
@ 2022-05-10 17:54     ` Yuan Fu
  2022-05-10 18:18       ` Yoav Marco
  2022-05-15 19:20       ` chad
  0 siblings, 2 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-10 17:54 UTC (permalink / raw)
  To: Yoav Marco; +Cc: emacs-devel



> On May 10, 2022, at 8:43 AM, Yoav Marco <yoavm448@gmail.com> wrote:
> 
> I benchmarked query compilation reuse:
> 
> |   |                                      | no reuse (now) | reuse |
> | 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
> | 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
> | 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |
> 
> 
> The patch to reuse the query is pretty dumb: if the char* for the query
> string didn't change from last time, it reuses the TSQuery object from
> last time instead of calling ts_query_new again. The patch is attached.
> 
> The elisp code for the benchmarks is also attached, but I'll give a
> summary here:
> 
> The queries are tree-sitter-langs' highlights.scm for C.
> 
> Benchmark 1 runs treesit-font-lock-fontify-region once on the entire
> buffer, meaning the query is compiled only once in both cases
> 
> Benchmark 2 runs treesit-font-lock-fontify-region on blocks of 60 lines,
> meaning the no reuse version has to compile the query 10 times even
> though nothing changes in the buffer or query.
> 
> Benchmark 3 is just 2 done all the way. xdisp.c has 36k lines, so the
> 6.06s is consistent
> (600 lines = 0.10s, multiply by 60 ⇒ 36k lines ~= 6.00s).
> 

I had a look and it’s a pretty sensible benchmark, and creating the query object taking a lot of time makes sense. But could you maybe run the benchmark under gprof and see what you get? Just curious.

> So, is caching worth it? I don't know. It definitely is if it's possible
> to do it internally without introducing a new object type. But I don't
> think that's possible without making a hash map or a complicated cache
> like the one for compiled regexps that compile_pattern uses in
> search.c.

Yeah using a single cache would probably result in a lot of misses since Emacs doesn’t fontify the whole buffer at once. We don’t necessarily need to use a hash map. I had a look at search.c and IIUC it uses an Emacs-wide array of 20 regex caches and links them into a linked list sorted by most-recently used, which doesn’t seem too bad? I think I can do something similar to that. Tho we might also want to allow users to pin some “persistent” cache, for example major mode font-locking and indent queries, as they are guaranteed to be reused a lot and are generally large (ie, slow to create). Maybe that’s unnecessary tho. And I wonder if there is a cheap & easy way to do caching buffer-locally…

Or maybe add an argument to query-capture that allows the user to specify whether they want the query to be cached, or assume the user wants the query to be cached if the query is in string form rather than in sexp form.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 20:51 ` Yuan Fu
       [not found]   ` <87lev9wyll.fsf@gmail.com>
@ 2022-05-10 15:43   ` Yoav Marco
  2022-05-10 17:54     ` Yuan Fu
  1 sibling, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-10 15:43 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1488 bytes --]

I benchmarked query compilation reuse:

|   |                                      | no reuse (now) | reuse |
| 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
| 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
| 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |


The patch to reuse the query is pretty dumb: if the char* for the query
string didn't change from last time, it reuses the TSQuery object from
last time instead of calling ts_query_new again. The patch is attached.

The elisp code for the benchmarks is also attached, but I'll give a
summary here:

The queries are tree-sitter-langs' highlights.scm for C.

Benchmark 1 runs treesit-font-lock-fontify-region once on the entire
buffer, meaning the query is compiled only once in both cases

Benchmark 2 runs treesit-font-lock-fontify-region on blocks of 60 lines,
meaning the no reuse version has to compile the query 10 times even
though nothing changes in the buffer or query.

Benchmark 3 is just 2 done all the way. xdisp.c has 36k lines, so the
6.06s is consistent
(600 lines = 0.10s, multiply by 60 ⇒ 36k lines ~= 6.00s).


So, is caching worth it? I don't know. It definitely is if it's possible
to do it internally without introducing a new object type. But I don't
think that's possible without making a hash map or a complicated cache
like the one for compiled regexps that compile_pattern uses in
search.c.


-- Yoav

[-- Attachment #2: bench.tar.gz --]
[-- Type: application/gzip, Size: 307730 bytes --]

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: 0001-Reuse-queries-in-a-dumb-way.patch --]
[-- Type: text/x-patch, Size: 1450 bytes --]

From ffd648dca62156d07d16d34c8d605eac59e7d822 Mon Sep 17 00:00:00 2001
From: Yoav Marco <yoavm448@gmail.com>
Date: Tue, 10 May 2022 14:04:34 +0300
Subject: [PATCH] Reuse queries in a dumb way

---
 src/treesit.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/treesit.c b/src/treesit.c
index 91114b0..490791a 100644
--- a/src/treesit.c
+++ b/src/treesit.c
@@ -1491,8 +1491,22 @@ DEFUN ("treesit-query-capture",
      querying with the same query can reuse the query object.  It also
      saves us from expanding the sexp query into a string.  I don't
      know how much time that could save though.  */
-  TSQuery *ts_query = ts_query_new (lang, source, strlen (source),
-				    &error_offset, &error_type);
+  static TSQuery *ts_query = NULL;
+  static char* prev_source = NULL;
+  if (source != prev_source)
+    {
+      printf ("Making query\n");
+      if (query)
+        ts_query_delete (ts_query);
+      ts_query = ts_query_new (lang, source, strlen (source),
+                               &error_offset, &error_type);
+    }
+  else
+    {
+      printf ("Reusing query\n");
+    }
+
+  prev_source = source;
   TSQueryCursor *cursor = ts_query_cursor_new ();
 
   if (ts_query == NULL)
@@ -1555,7 +1569,6 @@ DEFUN ("treesit-query-capture",
 	  result = prev_result;
 	}
     }
-  ts_query_delete (ts_query);
   ts_query_cursor_delete (cursor);
   return Fnreverse (result);
 }
-- 
2.35.3


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
       [not found]   ` <87lev9wyll.fsf@gmail.com>
@ 2022-05-10 15:20     ` Yoav Marco
  0 siblings, 0 replies; 187+ messages in thread
From: Yoav Marco @ 2022-05-10 15:20 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1485 bytes --]

I benchmarked query compilation reuse:
|   |                                      | no reuse (now) | reuse |
| 1 | Fontify xdisp.c all at once          |          0.01s | 0.01s |
| 2 | Fontify 60 next lines of xdisp.c ×10 |          0.10s | 0.00s |
| 3 | Fontify 60 next lines till the end   |          6.06s | 0.01s |


The patch to reuse the query is pretty dumb: if the char* for the query
string didn't change from last time, it reuses the TSQuery object from
last time instead of calling ts_query_new again. The patch is attached.

The elisp code for the benchmarks is also attached, but I'll give a
summary here:

The queries are tree-sitter-langs' highlights.scm for C.

Benchmark 1 runs treesit-font-lock-fontify-region once on the entire
buffer, meaning the query is compiled only once in both cases

Benchmark 2 runs treesit-font-lock-fontify-region on blocks of 60 lines,
meaning the no reuse version has to compile the query 10 times even
though nothing changes in the buffer or query.

Benchmark 3 is just 2 done all the way. xdisp.c has 36k lines, so the
6.06s is consistent
(600 lines = 0.10s, multiply by 60 ⇒ 36k lines ~= 6.00s).


So, is caching worth it? I don't know. It definitely is if it's possible
to do it internally without introducing a new object type. But I don't
think that's possible without making a hash map or a complicated cache
like the one for compiled regexps that compile_pattern uses in search.c.


-- Yoav

[-- Attachment #2: bench.tar.gz --]
[-- Type: application/gzip, Size: 307730 bytes --]

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: 0001-Reuse-queries-in-a-dumb-way.patch --]
[-- Type: text/x-patch, Size: 1450 bytes --]

From ffd648dca62156d07d16d34c8d605eac59e7d822 Mon Sep 17 00:00:00 2001
From: Yoav Marco <yoavm448@gmail.com>
Date: Tue, 10 May 2022 14:04:34 +0300
Subject: [PATCH] Reuse queries in a dumb way

---
 src/treesit.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/treesit.c b/src/treesit.c
index 91114b0..490791a 100644
--- a/src/treesit.c
+++ b/src/treesit.c
@@ -1491,8 +1491,22 @@ DEFUN ("treesit-query-capture",
      querying with the same query can reuse the query object.  It also
      saves us from expanding the sexp query into a string.  I don't
      know how much time that could save though.  */
-  TSQuery *ts_query = ts_query_new (lang, source, strlen (source),
-				    &error_offset, &error_type);
+  static TSQuery *ts_query = NULL;
+  static char* prev_source = NULL;
+  if (source != prev_source)
+    {
+      printf ("Making query\n");
+      if (query)
+        ts_query_delete (ts_query);
+      ts_query = ts_query_new (lang, source, strlen (source),
+                               &error_offset, &error_type);
+    }
+  else
+    {
+      printf ("Reusing query\n");
+    }
+
+  prev_source = source;
   TSQueryCursor *cursor = ts_query_cursor_new ();
 
   if (ts_query == NULL)
@@ -1555,7 +1569,6 @@ DEFUN ("treesit-query-capture",
 	  result = prev_result;
 	}
     }
-  ts_query_delete (ts_query);
   ts_query_cursor_delete (cursor);
   return Fnreverse (result);
 }
-- 
2.35.3


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 21:10                                             ` Yuan Fu
@ 2022-05-09 21:33                                               ` Theodor Thornhill
  2022-05-14  0:03                                                 ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-09 21:33 UTC (permalink / raw)
  To: Yuan Fu, Eli Zaretskii; +Cc: Stefan Monnier, Emacs Devel, Daniel Colascione

>
> I have some comments below, I haven’t tested the patch yet.
>

Thank you!

> +(defvar js-treesit-font-lock-settings-1
> +  '((javascript
> +     (
> +      ((identifier) @font-lock-constant-face
> +       (:match "^[A-Z_][A-Z_\\d]*$" @font-lock-constant-face))
>
> I would use treesit-expand-query to “expand” the sexp query to a string,
> so Emacs doesn’t need to re-expand it every time treesit-query-capture
> is called. I don’t know how much it speeds things up, but hey, it’s free.
>

Great, I did not know of this.  I'll try it out.

> +(defun js-treesit-move-to-node (fn)
> +  (when-let ((found-node (treesit-parent-until
> +                          (treesit-node-at (point) (point) 'javascript)
> +                          (lambda (parent)
> +                            (let ((parent-type (treesit-node-type parent)))
> +                              (or (equal "function_declaration" parent-type)
> +                                  ;;; More declarations here
> +                                  ))))))
> +    (goto-char (funcall fn found-node))))
> +
> +(defun js-treesit-beginning-of-defun (&optional arg)
> +  (js-treesit-move-to-node #'treesit-node-start))
> +
> +(defun js-treesit-end-of-defun (&optional arg)
> +  (js-treesit-move-to-node #'treesit-node-end))
>
> Maybe I could extract this into treesit.el, so major modes can specify
> simply the node name for a function definition and get function
> traversal for free.
>

Yeah!  My plan was to use `M-a` and `M-e` for siblings and `C-M-a` and
`C-M-e` for beginning/end-of-defun.  Some way of supplying "defun-nodes"
and make treesit handle the rest would be awesome.

> +(defcustom js-use-treesit-p nil
> +  "Use tree sitter for font locking, indentation and navigation"
> +  :version "29.1"
> +  :type 'boolean
> +  :safe 'booleanp)
>
> Maybe I should ditch treesit-disble-list and let major modes define
> their controlling variables like this?

Makes sense. The advantage with this is that you can decide more
granularly where you want to inject tree-sitter in the major mode
initialization phase, I guess?

>
> I also cc’d maintainer of js.el, since I don’t know anything about js.el.
>

Great!  I don't think this is "done", per se, but at least it is
something useful to look at, hopefully.

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 12:23                                           ` Eli Zaretskii
@ 2022-05-09 21:10                                             ` Yuan Fu
  2022-05-09 21:33                                               ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-09 21:10 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: Theodor Thornhill, Stefan Monnier, Emacs Devel, Daniel Colascione



> On May 9, 2022, at 5:23 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Theodor Thornhill <theo@thornhill.no>
>> Cc: monnier@iro.umontreal.ca, casouri@gmail.com, emacs-devel@gnu.org
>> Date: Mon, 09 May 2022 14:20:13 +0200
>> 
>> Did you get to look at the patch I sent? Should these efforts, when
>> their form is discovered be applied to this branch going forward? Or
>> should we wait until it lands on master?  I can look into more modes,
>> like css and others, but I'm a little wary of touching C, as they have a
>> lot of lineage and opinions and isn't my expertise.
>> 
>> Also I'm sure that bugs will be found when it is used, so I'm in favour
>> of applying the modes to this branch, if only to discover how best to
>> use tree sitter.
> 
> I'll leave it to Yuan's decision.  I didn't yet have time to build the
> branch, and without that I don't want to apply changes I cannot test.
> 
> Thanks.

I have some comments below, I haven’t tested the patch yet.

+(defvar js-treesit-font-lock-settings-1
+  '((javascript
+     (
+      ((identifier) @font-lock-constant-face
+       (:match "^[A-Z_][A-Z_\\d]*$" @font-lock-constant-face))

I would use treesit-expand-query to “expand” the sexp query to a string, so Emacs doesn’t need to re-expand it every time treesit-query-capture is called. I don’t know how much it speeds things up, but hey, it’s free.

+(defun js-treesit-move-to-node (fn)
+  (when-let ((found-node (treesit-parent-until
+                          (treesit-node-at (point) (point) 'javascript)
+                          (lambda (parent)
+                            (let ((parent-type (treesit-node-type parent)))
+                              (or (equal "function_declaration" parent-type)
+                                  ;;; More declarations here
+                                  ))))))
+    (goto-char (funcall fn found-node))))
+
+(defun js-treesit-beginning-of-defun (&optional arg)
+  (js-treesit-move-to-node #'treesit-node-start))
+
+(defun js-treesit-end-of-defun (&optional arg)
+  (js-treesit-move-to-node #'treesit-node-end))

Maybe I could extract this into treesit.el, so major modes can specify simply the node name for a function definition and get function traversal for free.

+(defcustom js-use-treesit-p nil
+  "Use tree sitter for font locking, indentation and navigation"
+  :version "29.1"
+  :type 'boolean
+  :safe 'booleanp)

Maybe I should ditch treesit-disble-list and let major modes define their controlling variables like this?

I also cc’d maintainer of js.el, since I don’t know anything about js.el.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 17:50 Yoav Marco
@ 2022-05-09 20:51 ` Yuan Fu
       [not found]   ` <87lev9wyll.fsf@gmail.com>
  2022-05-10 15:43   ` Yoav Marco
  0 siblings, 2 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-09 20:51 UTC (permalink / raw)
  To: Yoav Marco; +Cc: emacs-devel



> On May 9, 2022, at 10:50 AM, Yoav Marco <yoavm448@gmail.com> wrote:
> 
> Looking at the code, isn't the while loop in treesit-query-capture
> O(n²)? It essentially amounts to
> 
>  result = nil
>  while (next capture group is avaliable) {
>        captures = nil
>        for (capture in capture group) {
>          captures = cons(capture, captures)
>        }
>        captures = nreverse(captures)
>        if (captures pass all predicates in their query)
>           result = nconc(result, captures) // <----- THE OFFENDER
>  }
> 
> A better way to do this would be to call nconc(captures, result) and
> nreverse it all at the end instead of at the end of the for loop.
> 
> An even faster way would be to add unconditionally to result and roll it
> back in case predicates fail. This doesn't use nconc at all:
> 
>  result = nil
>  while (next capture group is avaliable) {
>        prev_result = result;
>        for (capture in capture group) {
>          result = cons(capture, result)
>        }
>        if (captures *fail* at a predicate)
>           result = prev_result
>  }
>  result = nreverse(result)
> 
> 
> Context: I'm still working on profiling query compilation, and from what
> I understand of gprof's output (not much) nconc indeed is very slow
> here. Seeing nconc in the report is what made me look for nconc usage in
> treesit-query-capture.
> 
> index % time    self  children    called     name
> [1]     98.4    1.76    0.13     169+9031282 <cycle 1 as a whole> [1]
>                1.59    0.00   41385             Fnconc <cycle 1> [2]
>                0.06    0.05 2432616+3714        process_mark_stack <cycle 1> [24]
>                0.08    0.00  147643             re_match_2_internal <cycle 1> [25]
>                0.02    0.00    1707+56214       mark_char_table <cycle 1> [32]
>                0.00    0.02      18             garbage_collect <cycle 1> [33]
>                ...
> 
> The thing profiled is calling treesit-font-lock-fontify-region with c
> queries accidentally on a go file with 8k lines. Still shouldn't take
> the 2.2 seconds that it did, though.

Thanks for looking at this! I pushed your proposed fix, could you try your profile again and see if it works?

> 
>> BTW, I would appreciate for someone to look at the manual and maybe
>> touch up a bit, as I’m not a native speaker and might write something
>> not very idiomatic/fluent.
> 
> One thing I've noticed - the manual node starts by introducing
> treesit-available-p, a function that doesn't seem to exist anymore?

It’s defined in treesit.el, have you required it before checking for that function?

Yuan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
@ 2022-05-09 17:50 Yoav Marco
  2022-05-09 20:51 ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Yoav Marco @ 2022-05-09 17:50 UTC (permalink / raw)
  To: casouri; +Cc: emacs-devel

Looking at the code, isn't the while loop in treesit-query-capture
O(n²)? It essentially amounts to

  result = nil
  while (next capture group is available) {
        captures = nil
        for (capture in capture group) {
          captures = cons(capture, captures)
        }
        captures = nreverse(captures)
        if (captures pass all predicates in their query)
           result = nconc(result, captures) // <----- THE OFFENDER
  }

A better way to do this would be to call nconc(captures, result) and
nreverse it all at the end instead of at the end of the for loop.

An even faster way would be to add unconditionally to result and roll it
back in case predicates fail. This doesn't use nconc at all:

  result = nil
  while (next capture group is available) {
        prev_result = result;
        for (capture in capture group) {
          result = cons(capture, result)
        }
        if (captures *fail* at a predicate)
           result = prev_result
  }
  result = nreverse(result)


Context: I'm still working on profiling query compilation, and from what
I understand of gprof's output (not much) nconc indeed is very slow
here. Seeing nconc in the report is what made me look for nconc usage in
treesit-query-capture.

index % time    self  children    called     name
[1]     98.4    1.76    0.13     169+9031282 <cycle 1 as a whole> [1]
                1.59    0.00   41385             Fnconc <cycle 1> [2]
                0.06    0.05 2432616+3714        process_mark_stack <cycle 1> [24]
                0.08    0.00  147643             re_match_2_internal <cycle 1> [25]
                0.02    0.00    1707+56214       mark_char_table <cycle 1> [32]
                0.00    0.02      18             garbage_collect <cycle 1> [33]
                ...

The thing profiled is calling treesit-font-lock-fontify-region with c
queries accidentally on a go file with 8k lines. Still shouldn't take
the 2.2 seconds that it did, though.


> BTW, I would appreciate for someone to look at the manual and maybe
> touch up a bit, as I’m not a native speaker and might write something
> not very idiomatic/fluent.

One thing I've noticed - the manual node starts by introducing
treesit-available-p, a function that doesn't seem to exist anymore?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 12:20                                         ` Theodor Thornhill
@ 2022-05-09 12:23                                           ` Eli Zaretskii
  2022-05-09 21:10                                             ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-09 12:23 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: monnier, casouri, emacs-devel

> From: Theodor Thornhill <theo@thornhill.no>
> Cc: monnier@iro.umontreal.ca, casouri@gmail.com, emacs-devel@gnu.org
> Date: Mon, 09 May 2022 14:20:13 +0200
> 
> Did you get to look at the patch I sent? Should these efforts, when
> their form is discovered be applied to this branch going forward? Or
> should we wait until it lands on master?  I can look into more modes,
> like css and others, but I'm a little wary of touching C, as they have a
> lot of lineage and opinions and isn't my expertise.
> 
> Also I'm sure that bugs will be found when it is used, so I'm in favour
> of applying the modes to this branch, if only to discover how best to
> use tree sitter.

I'll leave it to Yuan's decision.  I didn't yet have time to build the
branch, and without that I don't want to apply changes I cannot test.

Thanks.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-09 11:14                                       ` Eli Zaretskii
@ 2022-05-09 12:20                                         ` Theodor Thornhill
  2022-05-09 12:23                                           ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-09 12:20 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: monnier, casouri, emacs-devel

>
> Yes, it does, thanks.

I'm glad.

>
> However, I don't think I see the need to have in Emacs or ELPA
> something that helps building shared libraries from the
> language-specific parser files.  The parsers themselves and the way to
> build them are outside of the Emacs scope.  And since building a
> shared library is not really complicated, and I presume there are
> prebuilt shared libraries available, it sounds like a simple job, if
> at all.

Sure - I won't argue with this.  Though I've seen that many editors
support this automatic installing.  I'm not sure if that will be a
barrier or not, but surely if it is, someone will fill that gap.

Did you get to look at the patch I sent? Should these efforts, when
their form is discovered, be applied to this branch going forward? Or
should we wait until it lands on master?  I can look into more modes,
like css and others, but I'm a little wary of touching C, as it has a
lot of lineage and opinions and isn't my expertise.

Also I'm sure that bugs will be found when it is used, so I'm in favour
of applying the modes to this branch, if only to discover how best to
use tree sitter.

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 20:42                       ` Dmitry Gutov
@ 2022-05-09 11:18                         ` Eli Zaretskii
  0 siblings, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-09 11:18 UTC (permalink / raw)
  To: Dmitry Gutov; +Cc: monnier, theo, casouri, emacs-devel

> Date: Sun, 8 May 2022 23:42:41 +0300
> Cc: monnier@iro.umontreal.ca, theo@thornhill.no, casouri@gmail.com,
>  emacs-devel@gnu.org
> From: Dmitry Gutov <dgutov@yandex.ru>
> 
> On 08.05.2022 16:21, Eli Zaretskii wrote:
> >> We're talking about borrowing from Free Software editors, aren't we?
> > You want to ask developers of those editors to assign copyright to us?
> > If they agree, the problems I mentioned indeed won't exist.
> 
> That ignores my whole point (Atom being Free Software).
> 
> We do import externally maintained pieces of code from time to time, and 
> the particular feature under discussion depends on such code anyway 
> (Tree Sitter).

Building Emacs against an external free library, or using an external
application for some Emacs-related job, is not what I had in mind.  If
this was your point, then I didn't ignore it, I just described an
issue with a different way of "borrowing" code: directly copying code
into our sources.  That cannot be done unless the author agrees, and
for substantial amounts of such code we need a copyright assignment.

I just wanted people to be aware of this.  It is better to be aware of
that up front than to use the "borrowed" code in a submission to
Emacs, just to learn that we cannot accept such a submission.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 19:16                                     ` Theodor Thornhill
  2022-05-08 21:14                                       ` Yuan Fu
@ 2022-05-09 11:14                                       ` Eli Zaretskii
  2022-05-09 12:20                                         ` Theodor Thornhill
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-09 11:14 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: monnier, casouri, emacs-devel

> From: Theodor Thornhill <theo@thornhill.no>
> Cc: monnier@iro.umontreal.ca, casouri@gmail.com, emacs-devel@gnu.org
> Date: Sun, 08 May 2022 21:16:00 +0200
> 
> Oh right, now I understand.  The tree sitter implementation doesn't use
> an Emacs module any longer.  It uses the language definitions libraries
> like any other editor.  If you use the tree-sitter-module script you
> should use the 'batch-new.sh' script. This only creates a .so file.  My
> guess is that you could use any guide on the internet to compile such a
> shared object, then use it in emacs.  It seems to be a bit picky on the
> naming; it needs to be called 'libtree-sitter-LANG.so', which should
> probably be documented somewhere.  I cannot see that it is, yet.
> 
> So in short: Emacs cannot load the .js directly, but when downloaded
> they should be compiled to a .so/.dll/.dylib and put somewhere emacs can
> see it, such as ~/.emacs.d/tree-sitter.  This could be left to the user,
> but it would be nice for emacs to do this, or at least a package in
> elpa/nongnu elpa that does this so that the barrier of entry isn't too
> high.
> 
> I think I understood your concern now, does this answer help?

Yes, it does, thanks.

However, I don't think I see the need to have in Emacs or ELPA
something that helps building shared libraries from the
language-specific parser files.  The parsers themselves and the way to
build them are outside of the Emacs scope.  And since building a
shared library is not really complicated, and I presume there are
prebuilt shared libraries available, it sounds like a simple job, if
at all.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 10:33                           ` Eli Zaretskii
  2022-05-08 13:47                             ` Theodor Thornhill
@ 2022-05-08 22:42                             ` Stephen Leake
  1 sibling, 0 replies; 187+ messages in thread
From: Stephen Leake @ 2022-05-08 22:42 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Theodor Thornhill, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> Date: Sun, 08 May 2022 11:19:31 +0200
>> From: Theodor Thornhill <theo@thornhill.no>
>> Cc: casouri@gmail.com, emacs-devel@gnu.org
>> 
>> You need the parser on a path emacs can see. Either we need to
>> implement some generic parser
>> installation functions in emacs, or we need this module or others
>> like it. It compiles the grammar and creates
>> a shared object that is read by emacs tree sitter. This is the
>> tedious and error prone part.
>
> I don't understand.  I thought tree-sitter itself was such a parser.
> Why do we need another one as an Emacs module?

Tree-sitter is a parser _generator_, and a run-time for supporting such
generated parsers.

The Emacs tree-sitter module has only the tree-sitter run-time, not the
parse table and other stuff generated for the language-specific parser.

-- 
-- Stephe



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 19:16                                     ` Theodor Thornhill
@ 2022-05-08 21:14                                       ` Yuan Fu
  2022-05-09 11:14                                       ` Eli Zaretskii
  1 sibling, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-08 21:14 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: Eli Zaretskii, Stefan Monnier, emacs-devel



> On May 8, 2022, at 12:16 PM, Theodor Thornhill <theo@thornhill.no> wrote:
> 
>> 
>> My understanding was that tree-sitter can load these *.js parsers as
>> they are downloaded from the respective sites, or that there are tools
>> available to compile them if needed.  Why cannot we leave it to users
>> to do that by themselves, instead of using an Emacs module?
> 
> Oh right, now I understand.  The tree sitter implementation doesn't use
> an Emacs module any longer.  It uses the language definitions libraries
> like any other editor.  If you use the tree-sitter-module script you
> should use the 'batch-new.sh' script. This only creates a .so file.  My
> guess is that you could use any guide on the internet to compile such a
> shared object, then use it in emacs.  It seems to be a bit picky on the
> naming; it needs to be called 'libtree-sitter-LANG.so', which should
> probably be documented somewhere.  I cannot see that it is, yet.

libtree-sitter-LANG.so/dylib/dll seems to be the convention for language definition libraries. If there are irregular ones, one can use treesit-load-name-override-list to accommodate that (which is documented in the manual).

> 
> So in short: Emacs cannot load the .js directly, but when downloaded
> they should be compiled to a .so/.dll/.dylib and put somewhere emacs can
> see it, such as ~/.emacs.d/tree-sitter.  This could be left to the user,
> but it would be nice for emacs to do this, or at least a package in
> elpa/nongnu elpa that does this so that the barrier of entry isn't too
> high.

I don’t think anyone uses the .js grammar files directly; all the editors that support tree-sitter use the compiled .so files, and .so files are what each language definition distributes. Also, the compiler that compiles the .js grammar is written in, IIRC, nodejs, which I’d rather stay away from whenever possible.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 13:23                       ` Eli Zaretskii
@ 2022-05-08 20:57                         ` Dmitry Gutov
  0 siblings, 0 replies; 187+ messages in thread
From: Dmitry Gutov @ 2022-05-08 20:57 UTC (permalink / raw)
  To: Eli Zaretskii, Stefan Monnier; +Cc: theo, casouri, emacs-devel

On 08.05.2022 16:23, Eli Zaretskii wrote:
>> From: Stefan Monnier<monnier@iro.umontreal.ca>
>> Cc: Eli Zaretskii<eliz@gnu.org>,theo@thornhill.no,casouri@gmail.com,
>>    emacs-devel@gnu.org
>> Date: Sun, 08 May 2022 08:16:48 -0400
>>
>> Dmitry Gutov [2022-05-08 15:05:06] wrote:
>>> On 08.05.2022 09:18, Eli Zaretskii wrote:
>>>> While doing so, we should be aware and beware of the potential
>>>> copyright issues.  It is best for someone to read the code and
>>>> describe the ideas, and for someone else to implement those ideas
>>>> without looking at the code.
>>> We're talking about borrowing from Free Software editors, aren't we?
>> And we're not talking about borrowing code, but talking about sharing
>> the *format* of the indentation and highlighting rules.
> AFAIU, we are just waving hands, because what code will be borrowed
> and whether it will be is anyone's guess at this point.  So I just
> wanted people to be aware of the issue when they borrow ... whatever.

Atom uses a format like:

https://github.com/atom/language-javascript/pull/608/files

Support for it was implemented in 
https://github.com/atom/atom/pull/18321/ (but it would be easier to 
reimplement that in Elisp than try to "export" it via some kind of JS 
translation).

https://github.com/emacsmirror/tree-sitter-indent/blob/master/tree-sitter-indent.el 
is an existing third-party package for Emacs for Tree Sitter which does 
that. Though they seem to copy the indentation rules by hand (for now, 
only for Rust and Julia, it seems).

It should be possible to automate the latter export, though as long as 
the rules fit on one screen, manual translation should remain feasible.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 13:21                     ` Eli Zaretskii
@ 2022-05-08 20:42                       ` Dmitry Gutov
  2022-05-09 11:18                         ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Dmitry Gutov @ 2022-05-08 20:42 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: monnier, theo, casouri, emacs-devel

On 08.05.2022 16:21, Eli Zaretskii wrote:
>> We're talking about borrowing from Free Software editors, aren't we?
> You want to ask developers of those editors to assign copyright to us?
> If they agree, the problems I mentioned indeed won't exist.

That ignores my whole point (Atom being Free Software).

We do import externally maintained pieces of code from time to time, and 
the particular feature under discussion depends on such code anyway 
(Tree Sitter).

But see the other reply.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 14:42                                   ` Eli Zaretskii
@ 2022-05-08 19:16                                     ` Theodor Thornhill
  2022-05-08 21:14                                       ` Yuan Fu
  2022-05-09 11:14                                       ` Eli Zaretskii
  0 siblings, 2 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08 19:16 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: monnier, casouri, emacs-devel

>
> My understanding was that tree-sitter can load these *.js parsers as
> they are downloaded from the respective sites, or that there are tools
> available to compile them if needed.  Why cannot we leave it to users
> to do that by themselves, instead of using an Emacs module?

Oh right, now I understand.  The tree sitter implementation doesn't use
an Emacs module any longer.  It uses the language-definition libraries
like any other editor.  If you use the tree-sitter-module repo, you
should use the 'batch-new.sh' script.  This only creates a .so file.  My
guess is that you could use any guide on the internet to compile such a
shared object, then use it in Emacs.  It seems to be a bit picky about
the naming; the file needs to be called 'libtree-sitter-LANG.so', which
should probably be documented somewhere.  I cannot see that it is, yet.

So in short: Emacs cannot load the .js files directly; once downloaded,
they should be compiled to a .so/.dll/.dylib and put somewhere Emacs can
see it, such as ~/.emacs.d/tree-sitter.  This could be left to the user,
but it would be nice for Emacs to do this, or at least for a package in
elpa/nongnu elpa to do it, so that the barrier to entry isn't too
high.

I think I understood your concern now, does this answer help?

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 14:25                                 ` Theodor Thornhill
@ 2022-05-08 14:42                                   ` Eli Zaretskii
  2022-05-08 19:16                                     ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08 14:42 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: monnier, casouri, emacs-devel

> Date: Sun, 08 May 2022 16:25:43 +0200
> From: Theodor Thornhill <theo@thornhill.no>
> Cc: Eli Zaretskii <eliz@gnu.org>, casouri@gmail.com, emacs-devel@gnu.org
> 
>  The way I understand it, to use TS for a given language, you need: 
>  - The TS runtime (which should be included in Emacs if you use the 
>    corresponding feature branch and which include what I'd called "the 
>    incremental parser"). 
>  - The language's grammar.  This is usually a .js file and is compiled to 
>    an .so file by the tree sitter tools.  This is *not* included in the 
>    feature branch (neither the grammars nor the tools). 
>  - The glue in the major mode code. 
> 
> Yes, and the tree-sitter-module repo provides a way to compile these grammar.js files into an object emacs
> understands.

My understanding was that tree-sitter can load these *.js parsers as
they are downloaded from the respective sites, or that there are tools
available to compile them if needed.  Why cannot we leave it to users
to do that by themselves, instead of using an Emacs module?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 14:01                               ` Stefan Monnier
@ 2022-05-08 14:25                                 ` Theodor Thornhill
  2022-05-08 14:42                                   ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08 14:25 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Eli Zaretskii, casouri, emacs-devel

[-- Attachment #1: Type: text/html, Size: 2175 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 13:47                             ` Theodor Thornhill
  2022-05-08 13:58                               ` Eli Zaretskii
@ 2022-05-08 14:01                               ` Stefan Monnier
  2022-05-08 14:25                                 ` Theodor Thornhill
  1 sibling, 1 reply; 187+ messages in thread
From: Stefan Monnier @ 2022-05-08 14:01 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: Eli Zaretskii, casouri, emacs-devel

Theodor Thornhill [2022-05-08 15:47:48] wrote:
> On 8 May 2022 12:33, Eli Zaretskii <eliz@gnu.org> wrote:
>  > Date: Sun, 08 May 2022 11:19:31 +0200 
>  > From: Theodor Thornhill <theo@thornhill.no> 
>  > Cc: casouri@gmail.com, emacs-devel@gnu.org 
>  > 
>  > You need the parser on a path emacs can see. Either we need to implement some
>  generic parser 
>  > installation functions in emacs, or we need this module or others like it. It compiles
>  the grammar and creates 
>  > a shared object that is read by emacs tree sitter. This is the tedious and error prone
>  part. 
>
>  I don't understand.  I thought tree-sitter itself was such a parser. 
>  Why do we need another one as an Emacs module? 
>
> Because the only implementation possible for emacs is the generic parser. It needs
> specific parsers to be inserted. The two are orthogonal, but related.
>
> Emacs cannot and should not contain such parser binaries. They are os and architecture
> specific. 
>
> Also, this way the parsers can more easily be fixed
>
> Does that make sense?

Not really, no.
You might want to be more precise about what you mean by "parser" and
explain what's the difference between the "generic parser" and more
specific ones.

By "specific parser" are you referring to something that I might call
"compiled grammars"?

The way I understand it, to use TS for a given language, you need:
- The TS runtime (which should be included in Emacs if you use the
  corresponding feature branch and which includes what I'd call "the
  incremental parser").
- The language's grammar.  This is usually a .js file and is compiled to
  an .so file by the tree sitter tools.  This is *not* included in the
  feature branch (neither the grammars nor the tools).
- The glue in the major mode code.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 13:47                             ` Theodor Thornhill
@ 2022-05-08 13:58                               ` Eli Zaretskii
  2022-05-08 14:01                               ` Stefan Monnier
  1 sibling, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08 13:58 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: casouri, emacs-devel

> Date: Sun, 08 May 2022 15:47:48 +0200
> From: Theodor Thornhill <theo@thornhill.no>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> 
>  I don't understand.  I thought tree-sitter itself was such a parser. 
>  Why do we need another one as an Emacs module? 
> 
> Because the only implementation possible for emacs is the generic parser. It needs specific parsers to be
> inserted. The two are orthogonal, but related.
> 
> Emacs cannot and should not contain such parser binaries. They are os and architecture specific. 
> 
> Also, this way the parsers can more easily be fixed
> 
> Does that make sense?

I cannot say, sorry.  Because I still don't understand the need.

Let me ask this another way: what job does this parser do, and how is
it different from the parsing performed by tree-sitter itself?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 10:33                           ` Eli Zaretskii
@ 2022-05-08 13:47                             ` Theodor Thornhill
  2022-05-08 13:58                               ` Eli Zaretskii
  2022-05-08 14:01                               ` Stefan Monnier
  2022-05-08 22:42                             ` Stephen Leake
  1 sibling, 2 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08 13:47 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

[-- Attachment #1: Type: text/html, Size: 1594 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 12:16                     ` Stefan Monnier
@ 2022-05-08 13:23                       ` Eli Zaretskii
  2022-05-08 20:57                         ` Dmitry Gutov
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08 13:23 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: dgutov, theo, casouri, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Eli Zaretskii <eliz@gnu.org>,  theo@thornhill.no,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Sun, 08 May 2022 08:16:48 -0400
> 
> Dmitry Gutov [2022-05-08 15:05:06] wrote:
> > On 08.05.2022 09:18, Eli Zaretskii wrote:
> >> While doing so, we should be aware and beware of the potential
> >> copyright issues.  It is best for someone to read the code and
> >> describe the ideas, and for someone else to implement those ideas
> >> without looking at the code.
> > We're talking about borrowing from Free Software editors, aren't we?
> 
> And we're not talking about borrowing code, but talking about sharing
> the *format* of the indentation and highlighting rules.

AFAIU, we are just waving hands, because what code will be borrowed
and whether it will be is anyone's guess at this point.  So I just
wanted people to be aware of the issue when they borrow ... whatever.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 12:05                   ` Dmitry Gutov
  2022-05-08 12:16                     ` Stefan Monnier
@ 2022-05-08 13:21                     ` Eli Zaretskii
  2022-05-08 20:42                       ` Dmitry Gutov
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08 13:21 UTC (permalink / raw)
  To: Dmitry Gutov; +Cc: monnier, theo, casouri, emacs-devel

> Date: Sun, 8 May 2022 15:05:06 +0300
> Cc: theo@thornhill.no, casouri@gmail.com, emacs-devel@gnu.org
> From: Dmitry Gutov <dgutov@yandex.ru>
> 
> On 08.05.2022 09:18, Eli Zaretskii wrote:
> > While doing so, we should be aware and beware of the potential
> > copyright issues.  It is best for someone to read the code and
> > describe the ideas, and for someone else to implement those ideas
> > without looking at the code.
> 
> We're talking about borrowing from Free Software editors, aren't we?

You want to ask developers of those editors to assign copyright to us?
If they agree, the problems I mentioned indeed won't exist.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08 12:05                   ` Dmitry Gutov
@ 2022-05-08 12:16                     ` Stefan Monnier
  2022-05-08 13:23                       ` Eli Zaretskii
  2022-05-08 13:21                     ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Stefan Monnier @ 2022-05-08 12:16 UTC (permalink / raw)
  To: Dmitry Gutov; +Cc: Eli Zaretskii, theo, casouri, emacs-devel

Dmitry Gutov [2022-05-08 15:05:06] wrote:
> On 08.05.2022 09:18, Eli Zaretskii wrote:
>> While doing so, we should be aware and beware of the potential
>> copyright issues.  It is best for someone to read the code and
>> describe the ideas, and for someone else to implement those ideas
>> without looking at the code.
> We're talking about borrowing from Free Software editors, aren't we?

And we're not talking about borrowing code, but talking about sharing
the *format* of the indentation and highlighting rules.  The idea being
that those rules can then be distributed together with the grammar.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  6:18                 ` Eli Zaretskii
@ 2022-05-08 12:05                   ` Dmitry Gutov
  2022-05-08 12:16                     ` Stefan Monnier
  2022-05-08 13:21                     ` Eli Zaretskii
  0 siblings, 2 replies; 187+ messages in thread
From: Dmitry Gutov @ 2022-05-08 12:05 UTC (permalink / raw)
  To: Eli Zaretskii, Stefan Monnier; +Cc: theo, casouri, emacs-devel

On 08.05.2022 09:18, Eli Zaretskii wrote:
> While doing so, we should be aware and beware of the potential
> copyright issues.  It is best for someone to read the code and
> describe the ideas, and for someone else to implement those ideas
> without looking at the code.

We're talking about borrowing from Free Software editors, aren't we?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  9:19                         ` Theodor Thornhill
@ 2022-05-08 10:33                           ` Eli Zaretskii
  2022-05-08 13:47                             ` Theodor Thornhill
  2022-05-08 22:42                             ` Stephen Leake
  0 siblings, 2 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08 10:33 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: casouri, emacs-devel

> Date: Sun, 08 May 2022 11:19:31 +0200
> From: Theodor Thornhill <theo@thornhill.no>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> 
> You need the parser on a path emacs can see. Either we need to implement some generic parser
> installation functions in emacs, or we need this module or others like it. It compiles the grammar and creates
> a shared object that is read by emacs tree sitter. This is the tedious and error prone part. 

I don't understand.  I thought tree-sitter itself was such a parser.
Why do we need another one as an Emacs module?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  9:10                       ` Eli Zaretskii
@ 2022-05-08  9:19                         ` Theodor Thornhill
  2022-05-08 10:33                           ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08  9:19 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

[-- Attachment #1: Type: text/html, Size: 1714 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  9:02                     ` Theodor Thornhill
  2022-05-08  9:09                       ` Theodor Thornhill
@ 2022-05-08  9:10                       ` Eli Zaretskii
  2022-05-08  9:19                         ` Theodor Thornhill
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08  9:10 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: casouri, emacs-devel

> From: Theodor Thornhill <theo@thornhill.no>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> Date: Sun, 08 May 2022 11:02:10 +0200
> 
> Ok, see the attached patch.  This makes the normal js-mode support tree
> sitter.
> 
> Some caveats.
> 
> 1. You need to install the tree sitter parser.  Use Yuans
> tree-sitter-module [1] project for this

Why is that module needed, and where is it (it seems like the
reference was missing from your message)?



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  9:02                     ` Theodor Thornhill
@ 2022-05-08  9:09                       ` Theodor Thornhill
  2022-05-08  9:10                       ` Eli Zaretskii
  1 sibling, 0 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08  9:09 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel


Forgot to add link, sorry for that

> 1. You need to install the tree sitter parser.  Use Yuans
> tree-sitter-module [1] project for this


Theodor

[1]: https://github.com/casouri/tree-sitter-module.git




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  6:58                   ` Eli Zaretskii
@ 2022-05-08  9:02                     ` Theodor Thornhill
  2022-05-08  9:09                       ` Theodor Thornhill
  2022-05-08  9:10                       ` Eli Zaretskii
  0 siblings, 2 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08  9:02 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1084 bytes --]

>
> Yes, why not?
>
>> I'll just put it behind a "js-mode-use-treesit-p" defcustom or something like that?
>
> Something like that, yes.

Ok, see the attached patch.  This makes the normal js-mode support tree
sitter.

Some caveats.

1. You need to install the tree sitter parser.  Use Yuan's
tree-sitter-module [1] project for this.
2. Put the javascript grammar inside ~/.emacs.d/tree-sitter/
3. That should be it.
4. No wait, you need to set 'js-use-treesit-p' to 't' for this to work :)


This should yield decent indentation and syntax highlighting, and should
be sufficient for daily usage, I believe.  There are surely many things
that can improve, such as the navigation.  Now we only support
beginning-of-defun when inside of functions, but this is easily
extendible.  However, I'm not completely sold on the best way to deal
with that.  Suggestions welcome here.

Anyways. Please try it out and report what you think.

This is just a quick "look how easy it is to implement things using tree
sitter", but I think it is a good starting point.

All the best,
Theodor Thornhill


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Add-tree-sitter-functionality-to-js-mode.patch --]
[-- Type: text/x-diff, Size: 16812 bytes --]

From 1031bcf9af23d7c74af00f6132acc27756cc7721 Mon Sep 17 00:00:00 2001
From: Theodor Thornhill <theo@thornhill.no>
Date: Sun, 8 May 2022 10:52:56 +0200
Subject: [PATCH] Add tree sitter functionality to js-mode

* lisp/progmodes/js.el (js-use-treesit-p): New defcustom to control
whether to use tree sitter or not.

(js-treesit-backward-up-list): Utility function to find the scope when
no node is found.

(js-treesit-indent-rules): Rules for the simple indent engine.

(js-treesit-font-lock-settings-1): Queries for font locking.  Only one
level thus far.

(js-treesit-move-to-node, js-treesit-beginning-of-defun)
(js-treesit-end-of-defun): Utility functions to find a function from
point.  Only supports function thus far.

(js-treesit-enable): Function to enable tree sitter functionality.

(js-mode): Wrap the js-use-treesit-p defcustom around mode
initialization so that we can choose the implementation to use.
---
 lisp/progmodes/js.el | 391 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 311 insertions(+), 80 deletions(-)

diff --git a/lisp/progmodes/js.el b/lisp/progmodes/js.el
index 9c1358e466..cc00f4a7e4 100644
--- a/lisp/progmodes/js.el
+++ b/lisp/progmodes/js.el
@@ -3404,6 +3404,235 @@ js-jsx--detect-after-change
 (c-lang-defconst c-paragraph-start
   js-mode "\\(@[[:alpha:]]+\\>\\|$\\)")
 
+;;; Tree sitter integration
+(defcustom js-use-treesit-p nil
+  "Use tree sitter for font locking, indentation and navigation"
+  :version "29.1"
+  :type 'boolean
+  :safe 'booleanp)
+
+(defun js-treesit-backward-up-list ()
+  (lambda (node parent bol &rest _)
+    (save-excursion
+      (backward-up-list 1 nil t)
+      (goto-char
+       (treesit-node-start
+        (treesit-node-at (point) (point) 'javascript)))
+      (back-to-indentation)
+      (point))))
+
+(defvar js-treesit-indent-rules
+  `((javascript
+     (no-node (js-treesit-backward-up-list) ,js-indent-level)
+     ((node-is "}") parent-bol 0)
+     ((node-is ")") parent-bol 0)
+     ((node-is "]") parent-bol 0)
+     ((node-is ">") parent-bol 0)
+     ((node-is ".") parent-bol ,js-indent-level)
+     ((parent-is "named_imports") parent-bol ,js-indent-level)
+     ((parent-is "statement_block") parent-bol ,js-indent-level)
+     ((parent-is "variable_declarator") parent-bol ,js-indent-level)
+     ((parent-is "arguments") parent-bol ,js-indent-level)
+     ((parent-is "array") parent-bol ,js-indent-level)
+     ((parent-is "formal_parameters") parent-bol ,js-indent-level)
+     ((parent-is "template_substitution") parent-bol ,js-indent-level)
+     ((parent-is "object_pattern") parent-bol ,js-indent-level)
+     ((parent-is "object") parent-bol ,js-indent-level)
+     ((parent-is "arrow_function") parent-bol ,js-indent-level)
+     ((parent-is "parenthesized_expression") parent-bol ,js-indent-level)
+
+     ;; JSX
+     ((parent-is "jsx_opening_element") parent ,js-indent-level)
+     ((node-is "jsx_closing_element") parent 0)
+     ((node-is "jsx_text") parent ,js-indent-level)
+     ((parent-is "jsx_element") parent ,js-indent-level)
+     ;; TODO(Theo): This one is a little off.  Meant to hit the dangling '/' in
+     ;; a jsx-element.  But it is also division operator...
+     ((node-is "/") parent 0)
+     ((parent-is "jsx_self_closing_element") parent ,js-indent-level))))
+
+(defvar js-treesit-font-lock-settings-1
+  '((javascript
+     (
+      ((identifier) @font-lock-constant-face
+       (:match "^[A-Z_][A-Z_\\d]*$" @font-lock-constant-face))
+
+      (new_expression
+       constructor: (identifier) @font-lock-type-face)
+
+      (function
+       name: (identifier) @font-lock-function-name-face)
+
+      (function_declaration
+       name: (identifier) @font-lock-function-name-face)
+
+      (method_definition
+       name: (property_identifier) @font-lock-function-name-face)
+
+      (variable_declarator
+       name: (identifier) @font-lock-function-name-face
+       value: [(function) (arrow_function)])
+
+      (variable_declarator
+       name: (array_pattern (identifier) (identifier) @font-lock-function-name-face)
+       value: (array (number) (function)))
+
+      (assignment_expression
+       left: [(identifier) @font-lock-function-name-face
+              (member_expression property: (property_identifier) @font-lock-function-name-face)]
+       right: [(function) (arrow_function)])
+
+      (call_expression
+       function: [(identifier) @font-lock-function-name-face
+                  (member_expression
+                   property: (property_identifier) @font-lock-function-name-face)])
+
+      (variable_declarator
+       name: (identifier) @font-lock-variable-name-face)
+
+      (assignment_expression
+       left: [(identifier) @font-lock-variable-name-face
+              (member_expression property: (property_identifier) @font-lock-variable-name-face)])
+
+      (for_in_statement
+       left: (identifier) @font-lock-variable-name-face)
+
+      (arrow_function
+       parameter: (identifier) @font-lock-variable-name-face)
+
+      (arrow_function
+       parameters: [(_ (identifier) @font-lock-variable-name-face)
+                    (_ (_ (identifier) @font-lock-variable-name-face))
+                    (_ (_ (_ (identifier) @font-lock-variable-name-face)))])
+
+
+      (pair key: (property_identifier) @font-lock-variable-name-face)
+
+      (pair value: (identifier) @font-lock-variable-name-face)
+
+      (pair
+       key: (property_identifier) @font-lock-function-name-face
+       value: [(function) (arrow_function)])
+
+      ((shorthand_property_identifier) @font-lock-variable-name-face)
+
+      (pair_pattern key: (property_identifier) @font-lock-variable-name-face)
+
+      ((shorthand_property_identifier_pattern) @font-lock-variable-name-face)
+
+      (array_pattern (identifier) @font-lock-variable-name-face)
+
+      (jsx_opening_element [(nested_identifier (identifier)) (identifier)] @font-lock-function-name-face)
+      (jsx_closing_element [(nested_identifier (identifier)) (identifier)] @font-lock-function-name-face)
+      (jsx_self_closing_element [(nested_identifier (identifier)) (identifier)] @font-lock-function-name-face)
+      (jsx_attribute (property_identifier) @font-lock-constant-face)
+
+      [(this) (super)] @font-lock-keyword-face
+
+      [(true) (false) (null)] @font-lock-constant-face
+      ;; (regex pattern: (regex_pattern))
+      (number) @font-lock-constant-face
+
+      (string) @font-lock-string-face
+
+      ;; template strings need to be last in the file for embedded expressions
+      ;; to work properly
+      (template_string) @font-lock-string-face
+
+      (template_substitution
+       "${" @font-lock-constant-face
+       (_)
+       "}" @font-lock-constant-face
+       )
+
+      ["as"
+       "async"
+       "await"
+       "break"
+       "case"
+       "catch"
+       "class"
+       "const"
+       "continue"
+       "debugger"
+       "default"
+       "delete"
+       "do"
+       "else"
+       "export"
+       "extends"
+       "finally"
+       "for"
+       "from"
+       "function"
+       "get"
+       "if"
+       "import"
+       "in"
+       "instanceof"
+       "let"
+       "new"
+       "of"
+       "return"
+       "set"
+       "static"
+       "switch"
+       "switch"
+       "target"
+       "throw"
+       "try"
+       "typeof"
+       "var"
+       "void"
+       "while"
+       "with"
+       "yield"] @font-lock-keyword-face
+
+      (comment) @font-lock-comment-face
+      ))))
+
+(defun js-treesit-move-to-node (fn)
+  (when-let ((found-node (treesit-parent-until
+                          (treesit-node-at (point) (point) 'javascript)
+                          (lambda (parent)
+                            (let ((parent-type (treesit-node-type parent)))
+                              (or (equal "function_declaration" parent-type)
+                                  ;;; More declarations here
+                                  ))))))
+    (goto-char (funcall fn found-node))))
+
+(defun js-treesit-beginning-of-defun (&optional arg)
+  (js-treesit-move-to-node #'treesit-node-start))
+
+(defun js-treesit-end-of-defun (&optional arg)
+  (js-treesit-move-to-node #'treesit-node-end))
+
+
+(defun js-treesit-enable ()
+  (unless (or (treesit-should-enable-p)
+              (treesit-language-available-p 'javascript))
+    (error "Tree sitter isn't available"))
+
+  ;; Comments
+  (setq-local comment-start "// ")
+  (setq-local comment-start-skip "\\(?://+\\|/\\*+\\)\\s *")
+  (setq-local comment-end "")
+
+  (treesit-get-parser-create 'javascript)
+  (setq-local treesit-simple-indent-rules js-treesit-indent-rules)
+  (setq-local indent-line-function #'treesit-indent)
+  (setq-local beginning-of-defun-function #'js-treesit-beginning-of-defun)
+  (setq-local end-of-defun-function #'js-treesit-end-of-defun)
+
+  ;; This needs to be non-nil, because reasons
+  (unless font-lock-defaults
+    (setq font-lock-defaults '(nil t)))
+
+  (setq-local treesit-font-lock-defaults
+              '((js-treesit-font-lock-settings-1)))
+
+  (treesit-font-lock-enable))
+
 ;;; Main Function
 
 ;;;###autoload
@@ -3411,86 +3640,88 @@ js-mode
   "Major mode for editing JavaScript."
   :group 'js
   ;; Ensure all CC Mode "lang variables" are set to valid values.
-  (c-init-language-vars js-mode)
-  (setq-local indent-line-function #'js-indent-line)
-  (setq-local beginning-of-defun-function #'js-beginning-of-defun)
-  (setq-local end-of-defun-function #'js-end-of-defun)
-  (setq-local open-paren-in-column-0-is-defun-start nil)
-  (setq-local font-lock-defaults
-              (list js--font-lock-keywords nil nil nil nil
-                    '(font-lock-syntactic-face-function
-                      . js-font-lock-syntactic-face-function)))
-  (setq-local syntax-propertize-function #'js-syntax-propertize)
-  (add-hook 'syntax-propertize-extend-region-functions
-            #'syntax-propertize-multiline 'append 'local)
-  (add-hook 'syntax-propertize-extend-region-functions
-            #'js--syntax-propertize-extend-region 'append 'local)
-  (setq-local prettify-symbols-alist js--prettify-symbols-alist)
-
-  (setq-local parse-sexp-ignore-comments t)
-  (setq-local which-func-imenu-joiner-function #'js--which-func-joiner)
-
-  ;; Comments
-  (setq-local comment-start "// ")
-  (setq-local comment-start-skip "\\(?://+\\|/\\*+\\)\\s *")
-  (setq-local comment-end "")
-  (setq-local fill-paragraph-function #'js-fill-paragraph)
-  (setq-local normal-auto-fill-function #'js-do-auto-fill)
-
-  ;; Parse cache
-  (add-hook 'before-change-functions #'js--flush-caches t t)
-
-  ;; Frameworks
-  (js--update-quick-match-re)
-
-  ;; Syntax extensions
-  (unless (js-jsx--detect-and-enable)
-    (add-hook 'after-change-functions #'js-jsx--detect-after-change nil t))
-  (js-use-syntactic-mode-name)
-
-  ;; Imenu
-  (setq imenu-case-fold-search nil)
-  (setq imenu-create-index-function #'js--imenu-create-index)
-
-  ;; for filling, pretend we're cc-mode
-  (c-foreign-init-lit-pos-cache)
-  (add-hook 'before-change-functions #'c-foreign-truncate-lit-pos-cache nil t)
-  (setq-local comment-line-break-function #'c-indent-new-comment-line)
-  (setq-local comment-multi-line t)
-  (setq-local electric-indent-chars
-	      (append "{}():;," electric-indent-chars)) ;FIXME: js2-mode adds "[]*".
-  (setq-local electric-layout-rules
-	      '((?\; . after) (?\{ . after) (?\} . before)))
-
-  (let ((c-buffer-is-cc-mode t))
-    ;; FIXME: These are normally set by `c-basic-common-init'.  Should
-    ;; we call it instead?  (Bug#6071)
-    (make-local-variable 'paragraph-start)
-    (make-local-variable 'paragraph-separate)
-    (make-local-variable 'paragraph-ignore-fill-prefix)
-    (make-local-variable 'adaptive-fill-mode)
-    (make-local-variable 'adaptive-fill-regexp)
-    ;; While the full CC Mode style system is not yet in use, set the
-    ;; pertinent style variables manually.
-    (c-initialize-builtin-style)
-    (let ((style (cc-choose-style-for-mode 'js-mode c-default-style)))
-      (c-set-style style))
-    (setq c-block-comment-prefix "* "
-          c-comment-prefix-regexp "//+\\|\\**")
-    (c-setup-paragraph-variables))
-
-  ;; Important to fontify the whole buffer syntactically! If we don't,
-  ;; then we might have regular expression literals that aren't marked
-  ;; as strings, which will screw up parse-partial-sexp, scan-lists,
-  ;; etc. and produce maddening "unbalanced parenthesis" errors.
-  ;; When we attempt to find the error and scroll to the portion of
-  ;; the buffer containing the problem, JIT-lock will apply the
-  ;; correct syntax to the regular expression literal and the problem
-  ;; will mysteriously disappear.
-  ;; FIXME: We should instead do this fontification lazily by adding
-  ;; calls to syntax-propertize wherever it's really needed.
-  ;;(syntax-propertize (point-max))
-  )
+  (if js-use-treesit-p
+      (js-treesit-enable)
+    (c-init-language-vars js-mode)
+    (setq-local indent-line-function #'js-indent-line)
+    (setq-local beginning-of-defun-function #'js-beginning-of-defun)
+    (setq-local end-of-defun-function #'js-end-of-defun)
+    (setq-local open-paren-in-column-0-is-defun-start nil)
+    (setq-local font-lock-defaults
+                (list js--font-lock-keywords nil nil nil nil
+                      '(font-lock-syntactic-face-function
+                        . js-font-lock-syntactic-face-function)))
+    (setq-local syntax-propertize-function #'js-syntax-propertize)
+    (add-hook 'syntax-propertize-extend-region-functions
+              #'syntax-propertize-multiline 'append 'local)
+    (add-hook 'syntax-propertize-extend-region-functions
+              #'js--syntax-propertize-extend-region 'append 'local)
+    (setq-local prettify-symbols-alist js--prettify-symbols-alist)
+
+    (setq-local parse-sexp-ignore-comments t)
+    (setq-local which-func-imenu-joiner-function #'js--which-func-joiner)
+
+    ;; Comments
+    (setq-local comment-start "// ")
+    (setq-local comment-start-skip "\\(?://+\\|/\\*+\\)\\s *")
+    (setq-local comment-end "")
+    (setq-local fill-paragraph-function #'js-fill-paragraph)
+    (setq-local normal-auto-fill-function #'js-do-auto-fill)
+
+    ;; Parse cache
+    (add-hook 'before-change-functions #'js--flush-caches t t)
+
+    ;; Frameworks
+    (js--update-quick-match-re)
+
+    ;; Syntax extensions
+    (unless (js-jsx--detect-and-enable)
+      (add-hook 'after-change-functions #'js-jsx--detect-after-change nil t))
+    (js-use-syntactic-mode-name)
+
+    ;; Imenu
+    (setq imenu-case-fold-search nil)
+    (setq imenu-create-index-function #'js--imenu-create-index)
+
+    ;; for filling, pretend we're cc-mode
+    (c-foreign-init-lit-pos-cache)
+    (add-hook 'before-change-functions #'c-foreign-truncate-lit-pos-cache nil t)
+    (setq-local comment-line-break-function #'c-indent-new-comment-line)
+    (setq-local comment-multi-line t)
+    (setq-local electric-indent-chars
+	        (append "{}():;," electric-indent-chars)) ;FIXME: js2-mode adds "[]*".
+    (setq-local electric-layout-rules
+	        '((?\; . after) (?\{ . after) (?\} . before)))
+
+    (let ((c-buffer-is-cc-mode t))
+      ;; FIXME: These are normally set by `c-basic-common-init'.  Should
+      ;; we call it instead?  (Bug#6071)
+      (make-local-variable 'paragraph-start)
+      (make-local-variable 'paragraph-separate)
+      (make-local-variable 'paragraph-ignore-fill-prefix)
+      (make-local-variable 'adaptive-fill-mode)
+      (make-local-variable 'adaptive-fill-regexp)
+      ;; While the full CC Mode style system is not yet in use, set the
+      ;; pertinent style variables manually.
+      (c-initialize-builtin-style)
+      (let ((style (cc-choose-style-for-mode 'js-mode c-default-style)))
+        (c-set-style style))
+      (setq c-block-comment-prefix "* "
+            c-comment-prefix-regexp "//+\\|\\**")
+      (c-setup-paragraph-variables))
+
+    ;; Important to fontify the whole buffer syntactically! If we don't,
+    ;; then we might have regular expression literals that aren't marked
+    ;; as strings, which will screw up parse-partial-sexp, scan-lists,
+    ;; etc. and produce maddening "unbalanced parenthesis" errors.
+    ;; When we attempt to find the error and scroll to the portion of
+    ;; the buffer containing the problem, JIT-lock will apply the
+    ;; correct syntax to the regular expression literal and the problem
+    ;; will mysteriously disappear.
+    ;; FIXME: We should instead do this fontification lazily by adding
+    ;; calls to syntax-propertize wherever it's really needed.
+    ;;(syntax-propertize (point-max))
+    ))
 
 ;; Since we made JSX support available and automatically-enabled in
 ;; the base `js-mode' (for ease of use), now `js-jsx-mode' simply
-- 
2.25.1


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  6:49                 ` Theodor Thornhill
@ 2022-05-08  6:58                   ` Eli Zaretskii
  2022-05-08  9:02                     ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08  6:58 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: casouri, emacs-devel

> Date: Sun, 08 May 2022 08:49:00 +0200
> From: Theodor Thornhill <theo@thornhill.no>
> Cc: casouri@gmail.com, emacs-devel@gnu.org
> 
>  My suggestion would be to modify the Emacs major modes, not to 
>  introduce special stand-alone modes.  Making the changes directly in 
>  the existing modes is a faster way towards tree-sitter integration. 
> 
> Sure! If I write a patch for js-mode today, will you be able to apply it to the feature branch?

Yes, why not?

> I'll just put it behind a "js-mode-use-treesit-p" defcustom or something like that?

Something like that, yes.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-08  6:16               ` Eli Zaretskii
@ 2022-05-08  6:49                 ` Theodor Thornhill
  2022-05-08  6:58                   ` Eli Zaretskii
  0 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-08  6:49 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

[-- Attachment #1: Type: text/html, Size: 892 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 21:24               ` Stefan Monnier
  2022-05-07 22:02                 ` Theodor Thornhill
@ 2022-05-08  6:18                 ` Eli Zaretskii
  2022-05-08 12:05                   ` Dmitry Gutov
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08  6:18 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: theo, casouri, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Yuan Fu <casouri@gmail.com>,  Eli Zaretskii <eliz@gnu.org>,
>   emacs-devel@gnu.org
> Date: Sat, 07 May 2022 17:24:34 -0400
> 
> IOW I suggest looking at how Neovim/Atom/etc... specify that information
> and see if/how we can reuse it.  And if their format is not convenient
> for us, try to work with them to design a format that can be convenient
> for everyone.

While doing so, we should be aware and beware of the potential
copyright issues.  It is best for someone to read the code and
describe the ideas, and for someone else to implement those ideas
without looking at the code.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 20:12             ` Theodor Thornhill
  2022-05-07 21:24               ` Stefan Monnier
@ 2022-05-08  6:16               ` Eli Zaretskii
  2022-05-08  6:49                 ` Theodor Thornhill
  1 sibling, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-08  6:16 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: casouri, emacs-devel

> From: Theodor Thornhill <theo@thornhill.no>
> Cc: emacs-devel@gnu.org
> Date: Sat, 07 May 2022 22:12:37 +0200
> 
> >> I didn't expect tree-sitter to do this for us, I thought the code
> >> which integrates it into Emacs should do it, and hoped that code was
> >> already written.
> >
> > Ah, I see, that would require people more competent than me on each of
> > the languages and major modes to try using tree-sitter feature for
> > major modes, and maybe raising suggestions and concerns along the way,
> > like Theodor has done.
> 
> The quickest way to get this done is to start doing the work with some
> mode, say C, first as a separate mode maybe on the features/tree-sitter
> branch, then we can figure out later how to incorporate things?

My suggestion would be to modify the Emacs major modes, not to
introduce special stand-alone modes.  Making the changes directly in
the existing modes is a faster way towards tree-sitter integration.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 21:24               ` Stefan Monnier
@ 2022-05-07 22:02                 ` Theodor Thornhill
  2022-05-08  6:18                 ` Eli Zaretskii
  1 sibling, 0 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07 22:02 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Yuan Fu, Eli Zaretskii, emacs-devel

[-- Attachment #1: Type: text/html, Size: 1655 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 20:12             ` Theodor Thornhill
@ 2022-05-07 21:24               ` Stefan Monnier
  2022-05-07 22:02                 ` Theodor Thornhill
  2022-05-08  6:18                 ` Eli Zaretskii
  2022-05-08  6:16               ` Eli Zaretskii
  1 sibling, 2 replies; 187+ messages in thread
From: Stefan Monnier @ 2022-05-07 21:24 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: Yuan Fu, Eli Zaretskii, emacs-devel

BTW, a big part of the font-lock and indentation rules is highly
dependent on the grammar being used and in addition to that, they're
likely mostly agnostic to the design of `indent-line-function` or
`font-lock.el`.

So it would make a lot of sense to try and work with other editors to
try and design an editor-agnostic way to provide that info, such that it
doesn't need to be re-done for each and every editor out there.

IOW I suggest looking at how Neovim/Atom/etc... specify that information
and see if/how we can reuse it.  And if their format is not convenient
for us, try to work with them to design a format that can be convenient
for everyone.


        Stefan




^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 20:00           ` Yuan Fu
@ 2022-05-07 20:12             ` Theodor Thornhill
  2022-05-07 21:24               ` Stefan Monnier
  2022-05-08  6:16               ` Eli Zaretskii
  0 siblings, 2 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07 20:12 UTC (permalink / raw)
  To: Yuan Fu, Eli Zaretskii; +Cc: emacs-devel

Yuan Fu <casouri@gmail.com> writes:

>>> 
>>> That’s too much magic to ask from tree-sitter.
>> 
>> I didn't expect tree-sitter to do this for us, I thought the code
>> which integrates it into Emacs should do it, and hoped that code was
>> already written.
>
> Ah, I see, that would require people more competent than me on each of
> the languages and major modes to try using tree-sitter feature for
> major modes, and maybe raising suggestions and concerns along the way,
> like Theodor has done.

The quickest way to get this done is to start doing the work with some
mode, say C, first as a separate mode maybe on the feature/tree-sitter
branch, then we can figure out later how to incorporate things?  If
there's any interest for this then I can work on that, and hopefully we
can land on something useful soon?  After that we can see how we want to
proceed?

IMO keeping such a mode in the feature-branch could be wise for the ease
of installation.  Unless there's too much toe-stepping.

WDYT?

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 19:25         ` Eli Zaretskii
@ 2022-05-07 20:00           ` Yuan Fu
  2022-05-07 20:12             ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-07 20:00 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel

>> 
>> That’s too much magic to ask from tree-sitter.
> 
> I didn't expect tree-sitter to do this for us, I thought the code
> which integrates it into Emacs should do it, and hoped that code was
> already written.

Ah, I see, that would require people more competent than me on each of the languages and major modes to try using tree-sitter feature for major modes, and maybe raising suggestions and concerns along the way, like Theodor has done. I’ll try my best to respond and improve tree-sitter integration as those suggestions and concerns arise. Currently Theodor’s comments are still in my todo list. I’ll get to it soon™.

> 
> Don't be mistaken: that I thought the situation was different is
> entirely my fault, not yours.  Thank you for the work you have done
> that brought us where we are today.

Glad I can give something back to Emacs ;-)

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 19:11       ` Yuan Fu
@ 2022-05-07 19:25         ` Eli Zaretskii
  2022-05-07 20:00           ` Yuan Fu
  0 siblings, 1 reply; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-07 19:25 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 7 May 2022 12:11:06 -0700
> Cc: emacs-devel@gnu.org
> 
> > However, I hoped that the font-lock and indentation are already
> > integrated with tree-sitter, and all that is needed for a given
> > major-mode to use it is to flip some variable.  I didn't expect to
> > need a completely separate major mode for C sources, for example.
> > 
> > It sounds like we are farther from the goal that I thought we were,
> > and some serious work is still ahead of us to get this integrated into
> > the existing major modes.
> 
> I used a separate major mode because I don’t want to touch the existing one, and this is only a simple demo. For the author of a major-mode, they only need to port their font-lock-defaults and indent function to use tree-sitter features. If you look at the definition of ts-c-mode, its roughly

But eventually, we will touch the major modes, because the goal,
AFAIU, is to have tree-sitter-supported features be part of the major
modes Emacs provides, perhaps first as an opt-in feature, but
eventually I hope as the default behavior.

> > However, I hoped that the font-lock and indentation are already
> > integrated with tree-sitter, and all that is needed for a given
> > major-mode to use it is to flip some variable.
> 
> That’s too much magic to ask from tree-sitter.

I didn't expect tree-sitter to do this for us, I thought the code
which integrates it into Emacs should do it, and hoped that code was
already written.

Don't be mistaken: that I thought the situation was different is
entirely my fault, not yours.  Thank you for the work you have done
that brought us where we are today.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 19:00       ` Theodor Thornhill
@ 2022-05-07 19:21         ` Eli Zaretskii
  0 siblings, 0 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-07 19:21 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: casouri, emacs-devel

> From: Theodor Thornhill <theo@thornhill.no>
> Cc: emacs-devel@gnu.org
> Date: Sat, 07 May 2022 21:00:33 +0200
> 
> >
> > Yes, thanks.
> >
> > However, I hoped that the font-lock and indentation are already
> > integrated with tree-sitter, and all that is needed for a given
> > major-mode to use it is to flip some variable.  I didn't expect to
> > need a completely separate major mode for C sources, for example.
> 
> Isn't that the case, though?  You need to define the patterns to use,
> but apart from that, there's not much to it.

I hoped those patterns and the support code were already in place, at
least for the important major modes we use frequently.

> > It sounds like we are farther from the goal that I thought we were,
> > and some serious work is still ahead of us to get this integrated into
> > the existing major modes.
> 
> There's some manual labor involved in supporting a language.

I hoped some of that was already done.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 18:48     ` Eli Zaretskii
  2022-05-07 19:00       ` Theodor Thornhill
@ 2022-05-07 19:11       ` Yuan Fu
  2022-05-07 19:25         ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-07 19:11 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On May 7, 2022, at 11:48 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Sat, 7 May 2022 11:27:11 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>> Would you please suggest how to "play with it"?  What features to turn
>>> on and how, what commands and displays to try, etc.  Posting such
>>> instructions should allow all of us get to testing this branch much
>>> sooner and easier.
>> 
>> Some suggestions on top of my head:
>> 
>> - Load the attached treesit-demo.el which contains a very basic C major mode, ts-c-mode, that gives you some font-locking and indentation using tree-sitter features.
>> - Get a C file, change its extension to .tsc, and open it
>> - The major mode should be ts-c-mode, and it should have some basic fontification.
>> - Type M-x treesit-show-buffer-tree RET to see the AST of the C file, probably want to use this on a smaller C file.
>> - Type M-x treesit-inspect-mode RET to display the AST of the node at point in the mode-line, see more explanation in its docstring.
>> - Set treesit--indent-verbose to t, and try to indent some lines, and see which indentation rule is matched.
>> 
>> Is this the sort of things you are looking for?
> 
> Yes, thanks.
> 
> However, I hoped that the font-lock and indentation are already
> integrated with tree-sitter, and all that is needed for a given
> major-mode to use it is to flip some variable.  I didn't expect to
> need a completely separate major mode for C sources, for example.
> 
> It sounds like we are farther from the goal that I thought we were,
> and some serious work is still ahead of us to get this integrated into
> the existing major modes.

I used a separate major mode because I don’t want to touch the existing one, and this is only a simple demo. For the author of a major-mode, they only need to port their font-lock-defaults and indent function to use tree-sitter features. If you look at the definition of ts-c-mode, it’s roughly:

(if (treesit-should-enable-p)
    (set tree-sitter font-lock and indent variables)
  (set font-lock-defaults etc))

Then a user could choose whether to turn on tree-sitter for a particular major mode by changing treesit-disabled-modes. Maybe it should be treesit-enabled-modes, or something more elaborate, but that’s another topic.

Of course, if a major-mode author pleases, they could also define separate major modes, as Theodor pointed out.

> However, I hoped that the font-lock and indentation are already
> integrated with tree-sitter, and all that is needed for a given
> major-mode to use it is to flip some variable.

That’s too much magic to ask from tree-sitter. It is just an incremental-parser, work is still needed from major mode authors to use the parsed AST to font-lock/indent/etc. But at least I’ve made that process as easy as possible.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 18:33     ` Yuan Fu
@ 2022-05-07 19:02       ` Theodor Thornhill
  0 siblings, 0 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07 19:02 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Eli Zaretskii, emacs-devel


> So now 'tree-sitter-c should be 'c.
>

Right, I discovered that quickly :)

>> 
>> To make this work you need the definition compiled and put on some path
>> that emacs can see.  The suggested one is LD_LIBRARY_PATH, and I start
>> emacs using something like this:
>> 
>> ```
>> LD_LIBRARY_PATH=/home/theo/src/tree-sitter-module/dist /path/to/tree-sitter-enabled-emacs/src/emacs
>> ```
>
> Now you can use tree-sitter-extra-load-path
>

Thanks!

>
> Now you should use treesit- prefix.
>

Yeah - thanks :)

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 18:48     ` Eli Zaretskii
@ 2022-05-07 19:00       ` Theodor Thornhill
  2022-05-07 19:21         ` Eli Zaretskii
  2022-05-07 19:11       ` Yuan Fu
  1 sibling, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07 19:00 UTC (permalink / raw)
  To: Eli Zaretskii, Yuan Fu; +Cc: emacs-devel

>
> Yes, thanks.
>
> However, I hoped that the font-lock and indentation are already
> integrated with tree-sitter, and all that is needed for a given
> major-mode to use it is to flip some variable.  I didn't expect to
> need a completely separate major mode for C sources, for example.

Isn't that the case, though?  You need to define the patterns to use,
but apart from that, there's not much to it.  I don't believe
tree-sitter itself is opinionated on how saturated the font-locking
should be.  As for defining completely separate major modes, I
personally think that's some of the point.  All of CC Mode functionality
can be replaced by the equivalent tree-sitter one.  Whether or not that
should be its own mode or behind some defcustom is up for discussion.

>
> It sounds like we are farther from the goal that I thought we were,
> and some serious work is still ahead of us to get this integrated into
> the existing major modes.

There's some manual labor involved in supporting a language.  Look at
how neovim does it for an alternative:
https://github.com/nvim-treesitter/nvim-treesitter

This is a collection of queries and highlights, as well as some
functionality to download the proper definitions.

Theodor



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 18:27   ` Yuan Fu
@ 2022-05-07 18:48     ` Eli Zaretskii
  2022-05-07 19:00       ` Theodor Thornhill
  2022-05-07 19:11       ` Yuan Fu
  0 siblings, 2 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-07 18:48 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 7 May 2022 11:27:11 -0700
> Cc: emacs-devel@gnu.org
> 
> > Would you please suggest how to "play with it"?  What features to turn
> > on and how, what commands and displays to try, etc.  Posting such
> > instructions should allow all of us get to testing this branch much
> > sooner and easier.
> 
> Some suggestions on top of my head:
> 
> - Load the attached treesit-demo.el which contains a very basic C major mode, ts-c-mode, that gives you some font-locking and indentation using tree-sitter features.
> - Get a C file, change its extension to .tsc, and open it
> - The major mode should be ts-c-mode, and it should have some basic fontification.
> - Type M-x treesit-show-buffer-tree RET to see the AST of the C file, probably want to use this on a smaller C file.
> - Type M-x treesit-inspect-mode RET to display the AST of the node at point in the mode-line, see more explanation in its docstring.
> - Set treesit--indent-verbose to t, and try to indent some lines, and see which indentation rule is matched.
> 
> Is this the sort of things you are looking for?

Yes, thanks.

However, I hoped that the font-lock and indentation are already
integrated with tree-sitter, and all that is needed for a given
major-mode to use it is to flip some variable.  I didn't expect to
need a completely separate major mode for C sources, for example.

It sounds like we are farther from the goal that I thought we were,
and some serious work is still ahead of us to get this integrated into
the existing major modes.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  9:34   ` Theodor Thornhill
@ 2022-05-07 18:33     ` Yuan Fu
  2022-05-07 19:02       ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-07 18:33 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: Eli Zaretskii, emacs-devel

I forgot to mention the changes I made prior to pushing:
- Changed prefix to treesit-
- Now language definitions are referred to as 'c and 'python, rather than 'tree-sitter-c and 'tree-sitter-python (because we changed the prefix)
- New variable treesit-extra-load-path that lets the user use different load-paths for language definitions

These are the changes we agreed on in the old thread. I also added a new indentation preset parent-bol, as suggested by Theodor.

> The easiest way to play with it is to use some provided major mode that
> implements it.  I've made one here for golang, for reference:
> https://git.sr.ht/~theo/dotfiles/tree/master/item/emacs/.emacs.d/lisp/go-mode.el.
> 
> This can easily be modified to use 'tree-sitter-c instead

So now 'tree-sitter-c should be 'c.

> 
> To make this work you need the definition compiled and put on some path
> that emacs can see.  The suggested one is LD_LIBRARY_PATH, and I start
> emacs using something like this:
> 
> ```
> LD_LIBRARY_PATH=/home/theo/src/tree-sitter-module/dist /path/to/tree-sitter-enabled-emacs/src/emacs
> ```

Now you can use treesit-extra-load-path

> 
> This is using the scripts provided by Yuan Fu in the tree-sitter-modules
> repo.  When the major mode is enabled, you can start querying the
> document by using the provided functions, such as:
> 
> ```elisp
> (tree-sitter-node-start
>  (tree-sitter-node-at (point) (point) 'tree-sitter-go))
> ```
> 
> `M-x tree-sitter-inspect-mode` shows what node you are on in the mode
> line.  It helps with debugging.
> 
> By 'play with it', I guess we can implement folding, highlighting,
> navigation, simple rename functionality, Imenu etc etc.

Now you should use treesit- prefix.

Thanks,
Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  9:04 ` Eli Zaretskii
  2022-05-07  9:34   ` Theodor Thornhill
@ 2022-05-07 18:27   ` Yuan Fu
  2022-05-07 18:48     ` Eli Zaretskii
  1 sibling, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-07 18:27 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel

> 
>> - play with it
> 
> Would you please suggest how to "play with it"?  What features to turn
> on and how, what commands and displays to try, etc.  Posting such
> instructions should allow all of us get to testing this branch much
> sooner and easier.

Some suggestions off the top of my head:

- Load the attached treesit-demo.el which contains a very basic C major mode, ts-c-mode, that gives you some font-locking and indentation using tree-sitter features.
- Get a C file, change its extension to .tsc, and open it
- The major mode should be ts-c-mode, and it should have some basic fontification.
- Type M-x treesit-show-buffer-tree RET to see the AST of the C file, probably want to use this on a smaller C file.
- Type M-x treesit-inspect-mode RET to display the AST of the node at point in the mode-line, see more explanation in its docstring.
- Set treesit--indent-verbose to t, and try to indent some lines, and see which indentation rule is matched.

Is this the sort of thing you are looking for? Or are you thinking about testing the functionality of tree-sitter? For that I can think of:
- Putting language definitions in different places and seeing whether Emacs can find them correctly (~/.emacs.d/tree-sitter, path in treesit-extra-load-path, LD_LIBRARY_PATH, etc)
- Running tests in test/src/treesit-tests.el
- Use narrow and widen and see if everything works
- etc

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07 17:59   ` Yuan Fu
@ 2022-05-07 18:16     ` Theodor Thornhill
  0 siblings, 0 replies; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07 18:16 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel


> Pushed, hopefully I got it right ;-)

Thanks, that's perfect!



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  8:47 ` Theodor Thornhill
@ 2022-05-07 17:59   ` Yuan Fu
  2022-05-07 18:16     ` Theodor Thornhill
  0 siblings, 1 reply; 187+ messages in thread
From: Yuan Fu @ 2022-05-07 17:59 UTC (permalink / raw)
  To: Theodor Thornhill; +Cc: Emacs Devel



> On May 7, 2022, at 1:47 AM, Theodor Thornhill <theo@thornhill.no> wrote:
> 
> 
> I’ve pushed the tree-sitter integration to feature/tree-sitter. If anyone want to give it a try:
> 
> 
> Thanks! Can we merge it with some recent version of master? I seem to remember there were some eldoc issues unrelated to tree sitter no longer present on master. I don't have permissions to do it, so if you don't have time Yuan, maybe someone else can do it? 

Pushed, hopefully I got it right ;-)

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  9:04 ` Eli Zaretskii
@ 2022-05-07  9:34   ` Theodor Thornhill
  2022-05-07 18:33     ` Yuan Fu
  2022-05-07 18:27   ` Yuan Fu
  1 sibling, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07  9:34 UTC (permalink / raw)
  To: Eli Zaretskii, Yuan Fu; +Cc: emacs-devel


> Thanks!
>
>> - play with it
>
> Would you please suggest how to "play with it"?  What features to turn
> on and how, what commands and displays to try, etc.  Posting such
> instructions should allow all of us get to testing this branch much
> sooner and easier.

The easiest way to play with it is to use some provided major mode that
implements it.  I've made one here for golang, for reference:
https://git.sr.ht/~theo/dotfiles/tree/master/item/emacs/.emacs.d/lisp/go-mode.el.

This can easily be modified to use 'tree-sitter-c instead

To make this work you need the definition compiled and put on some path
that emacs can see.  The suggested one is LD_LIBRARY_PATH, and I start
emacs using something like this:

```
LD_LIBRARY_PATH=/home/theo/src/tree-sitter-module/dist /path/to/tree-sitter-enabled-emacs/src/emacs
```

This is using the scripts provided by Yuan Fu in the tree-sitter-modules
repo.  When the major mode is enabled, you can start querying the
document by using the provided functions, such as:

```elisp
(tree-sitter-node-start
  (tree-sitter-node-at (point) (point) 'tree-sitter-go))
```

`M-x tree-sitter-inspect-mode` shows what node you are on in the mode
line.  It helps with debugging.

By 'play with it', I guess we can implement folding, highlighting,
navigation, simple rename functionality, Imenu etc etc.

Hope this helps a little!

Theodor Thornhill



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  8:29 Yuan Fu
  2022-05-07  8:44 ` Yuan Fu
  2022-05-07  8:47 ` Theodor Thornhill
@ 2022-05-07  9:04 ` Eli Zaretskii
  2022-05-07  9:34   ` Theodor Thornhill
  2022-05-07 18:27   ` Yuan Fu
  2022-05-14 15:09 ` Daniel Martín
  3 siblings, 2 replies; 187+ messages in thread
From: Eli Zaretskii @ 2022-05-07  9:04 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 7 May 2022 01:29:17 -0700
> 
> I’ve pushed the tree-sitter integration to feature/tree-sitter.

Thanks!

> - play with it

Would you please suggest how to "play with it"?  What features to turn
on and how, what commands and displays to try, etc.  Posting such
instructions should allow all of us to get to testing this branch much
sooner and easier.



^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  8:29 Yuan Fu
  2022-05-07  8:44 ` Yuan Fu
@ 2022-05-07  8:47 ` Theodor Thornhill
  2022-05-07 17:59   ` Yuan Fu
  2022-05-07  9:04 ` Eli Zaretskii
  2022-05-14 15:09 ` Daniel Martín
  3 siblings, 1 reply; 187+ messages in thread
From: Theodor Thornhill @ 2022-05-07  8:47 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

[-- Attachment #1: Type: text/html, Size: 799 bytes --]

^ permalink raw reply	[flat|nested] 187+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-07  8:29 Yuan Fu
@ 2022-05-07  8:44 ` Yuan Fu
  2022-05-07  8:47 ` Theodor Thornhill
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-07  8:44 UTC (permalink / raw)
  To: Emacs Devel



> On May 7, 2022, at 1:29 AM, Yuan Fu <casouri@gmail.com> wrote:
> 
> Guys,
> 
> I’ve pushed the tree-sitter integration to feature/tree-sitter. If anyone want to give it a try:
> - get tree-sitter from GitHub (or your package manager, make sure the version is at least 0.20.2)
> https://github.com/tree-sitter/tree-sitter
> - pull and build the branch
> - read (elisp)Parsing Program Source
> - grab language definitions from https://github.com/casouri/tree-sitter-module
> You can either build with my script, or download the prebuilt ones
> - play with it

BTW, I would appreciate it if someone could look at the manual and maybe touch it up a bit, as I’m not a native speaker and might have written something not very idiomatic/fluent.

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

* Tree-sitter integration on feature/tree-sitter
@ 2022-05-07  8:29 Yuan Fu
  2022-05-07  8:44 ` Yuan Fu
                   ` (3 more replies)
  0 siblings, 4 replies; 187+ messages in thread
From: Yuan Fu @ 2022-05-07  8:29 UTC (permalink / raw)
  To: Emacs Devel

Guys,

I’ve pushed the tree-sitter integration to feature/tree-sitter. If anyone wants to give it a try:
- get tree-sitter from GitHub (or your package manager; make sure the version is at least 0.20.2)
https://github.com/tree-sitter/tree-sitter
- pull and build the branch
- read (elisp)Parsing Program Source
- grab language definitions from https://github.com/casouri/tree-sitter-module
You can either build with my script, or download the prebuilt ones
- play with it

Also, I apologize in advance that my responses might be slow until around June :-)

Yuan


^ permalink raw reply	[flat|nested] 187+ messages in thread

end of thread, other threads:[~2022-06-30 14:37 UTC | newest]

Thread overview: 187+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-19  1:35 Tree-sitter integration on feature/tree-sitter Kiong-Ge Liau
2022-05-20  2:01 ` Yuan Fu
2022-06-16 19:03   ` Yuan Fu
2022-06-16 19:25     ` [External] : " Drew Adams
2022-06-17  1:11       ` Yuan Fu
2022-06-17 14:22         ` Drew Adams
2022-06-17  1:24     ` Po Lu
2022-06-18  0:09       ` Yuan Fu
2022-06-17  2:00     ` Ihor Radchenko
2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
2022-06-17  2:55         ` Stefan Monnier
2022-06-17  5:28           ` Eli Zaretskii
2022-06-17 10:10             ` Ihor Radchenko
2022-06-17 11:03               ` Eli Zaretskii
2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
2022-06-17 10:40         ` Ihor Radchenko
2022-06-17 11:42           ` Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter) Eli Zaretskii
2022-06-18  5:52             ` Ihor Radchenko
2022-06-18  7:01               ` Eli Zaretskii
2022-06-18  7:23                 ` Ihor Radchenko
2022-06-18  7:44                   ` Eli Zaretskii
2022-06-18  8:13                     ` Ihor Radchenko
2022-06-18  8:47                       ` Exposing buffer text modifications to Lisp Eli Zaretskii
2022-06-20 11:58                         ` Ihor Radchenko
2022-06-20 12:32                           ` Eli Zaretskii
2022-06-20 14:14                             ` Stefan Kangas
2022-06-21  3:56                               ` Ihor Radchenko
2022-06-21  4:36                             ` Ihor Radchenko
2022-06-21 12:27                               ` Eli Zaretskii
2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
2022-06-25  8:44                                     ` Eli Zaretskii
2022-06-25  9:07                                       ` Stefan Monnier
2022-06-25  9:20                                         ` Eli Zaretskii
2022-06-25  9:27                                           ` Stefan Monnier
2022-06-25  9:47                                         ` Ihor Radchenko
2022-06-25  9:53                                           ` Stefan Monnier
2022-06-26 10:32                                   ` Robert Pluim
2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
2022-06-22 16:13                               ` Eli Zaretskii
2022-06-25  4:54                                 ` Ihor Radchenko
2022-06-25  5:46                                   ` Eli Zaretskii
2022-06-29 12:24                                     ` Ihor Radchenko
2022-06-20 14:33                           ` Alan Mackenzie
2022-06-21  3:58                             ` Ihor Radchenko
2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
2022-06-17  7:17       ` Yuan Fu
2022-06-17 10:37         ` Eli Zaretskii
2022-06-18  0:14           ` Yuan Fu
2022-06-18  6:22             ` Eli Zaretskii
2022-06-18  8:25               ` Yuan Fu
2022-06-18  8:50                 ` Eli Zaretskii
2022-06-18 20:07                   ` Yuan Fu
2022-06-19  5:39                     ` Eli Zaretskii
2022-06-20  3:00                       ` Yuan Fu
2022-06-20 11:44                         ` Eli Zaretskii
2022-06-20 20:01                           ` Yuan Fu
2022-06-21  2:26                             ` Eli Zaretskii
2022-06-21  4:39                               ` Yuan Fu
2022-06-21 10:18                                 ` Eli Zaretskii
2022-06-22  0:34                                   ` Yuan Fu
2022-06-17 11:06     ` Jostein Kjønigsen
2022-06-18  0:28       ` Yuan Fu
2022-06-18 20:57         ` Jostein Kjønigsen
  -- strict thread matches above, loose matches on Subject: below --
2022-06-29 16:51 Abin Simon
2022-06-29 17:43 ` Yoav Marco
2022-06-30 11:21   ` Yoav Marco
2022-06-30 14:29     ` Abin Simon
2022-06-30 14:37       ` Yoav Marco
2022-06-28 16:08 Yoav Marco
2022-06-28 19:35 ` Yoav Marco
2022-06-29 15:35   ` Yuan Fu
2022-05-19  1:35 Kiong-Ge Liau
2022-05-09 17:50 Yoav Marco
2022-05-09 20:51 ` Yuan Fu
     [not found]   ` <87lev9wyll.fsf@gmail.com>
2022-05-10 15:20     ` Yoav Marco
2022-05-10 15:43   ` Yoav Marco
2022-05-10 17:54     ` Yuan Fu
2022-05-10 18:18       ` Yoav Marco
2022-05-10 19:58         ` Stefan Monnier
2022-05-10 23:11           ` Yuan Fu
2022-05-10 23:53             ` Yuan Fu
2022-05-11 11:10         ` Eli Zaretskii
2022-05-11 11:16           ` Yoav Marco
2022-05-11 14:20             ` Eli Zaretskii
2022-05-11 15:40               ` Yoav Marco
2022-05-11 16:27                 ` Eli Zaretskii
2022-05-11 20:14                   ` Yuan Fu
2022-05-11 20:25                     ` Yuan Fu
2022-05-12  5:19                       ` Eli Zaretskii
2022-05-12  6:10                         ` Yuan Fu
2022-05-12  7:12                           ` Eli Zaretskii
2022-05-12 15:18                         ` Stefan Monnier
2022-05-12 15:53                           ` Eli Zaretskii
2022-05-12  5:17                     ` Eli Zaretskii
2022-05-12  6:07                       ` Yuan Fu
2022-05-12 14:16                       ` Yoav Marco
2022-05-12 16:04                         ` Eli Zaretskii
2022-05-12 16:26                           ` Yoav Marco
2022-05-12 17:18                             ` Eli Zaretskii
2022-05-12 17:22                               ` Yoav Marco
2022-05-13  6:34                                 ` Eli Zaretskii
2022-05-13  8:04                                   ` Theodor Thornhill
2022-05-13  8:36                                     ` Yoav Marco
2022-05-13  9:46                                       ` Theodor Thornhill
2022-05-13 10:37                                     ` Eli Zaretskii
2022-05-13 10:52                                       ` Theodor Thornhill
2022-05-13  8:42                                   ` Yoav Marco
2022-05-13 10:41                                     ` Eli Zaretskii
2022-05-14  0:04                                       ` Yuan Fu
2022-06-16 19:16                                         ` Yuan Fu
2022-06-16 21:57                                           ` yoavm448
2022-06-17  1:10                                             ` Yuan Fu
2022-05-12 15:15                       ` Stefan Monnier
2022-05-15 19:20       ` chad
2022-05-15 19:26         ` Eli Zaretskii
2022-05-07  8:29 Yuan Fu
2022-05-07  8:44 ` Yuan Fu
2022-05-07  8:47 ` Theodor Thornhill
2022-05-07 17:59   ` Yuan Fu
2022-05-07 18:16     ` Theodor Thornhill
2022-05-07  9:04 ` Eli Zaretskii
2022-05-07  9:34   ` Theodor Thornhill
2022-05-07 18:33     ` Yuan Fu
2022-05-07 19:02       ` Theodor Thornhill
2022-05-07 18:27   ` Yuan Fu
2022-05-07 18:48     ` Eli Zaretskii
2022-05-07 19:00       ` Theodor Thornhill
2022-05-07 19:21         ` Eli Zaretskii
2022-05-07 19:11       ` Yuan Fu
2022-05-07 19:25         ` Eli Zaretskii
2022-05-07 20:00           ` Yuan Fu
2022-05-07 20:12             ` Theodor Thornhill
2022-05-07 21:24               ` Stefan Monnier
2022-05-07 22:02                 ` Theodor Thornhill
2022-05-08  6:18                 ` Eli Zaretskii
2022-05-08 12:05                   ` Dmitry Gutov
2022-05-08 12:16                     ` Stefan Monnier
2022-05-08 13:23                       ` Eli Zaretskii
2022-05-08 20:57                         ` Dmitry Gutov
2022-05-08 13:21                     ` Eli Zaretskii
2022-05-08 20:42                       ` Dmitry Gutov
2022-05-09 11:18                         ` Eli Zaretskii
2022-05-08  6:16               ` Eli Zaretskii
2022-05-08  6:49                 ` Theodor Thornhill
2022-05-08  6:58                   ` Eli Zaretskii
2022-05-08  9:02                     ` Theodor Thornhill
2022-05-08  9:09                       ` Theodor Thornhill
2022-05-08  9:10                       ` Eli Zaretskii
2022-05-08  9:19                         ` Theodor Thornhill
2022-05-08 10:33                           ` Eli Zaretskii
2022-05-08 13:47                             ` Theodor Thornhill
2022-05-08 13:58                               ` Eli Zaretskii
2022-05-08 14:01                               ` Stefan Monnier
2022-05-08 14:25                                 ` Theodor Thornhill
2022-05-08 14:42                                   ` Eli Zaretskii
2022-05-08 19:16                                     ` Theodor Thornhill
2022-05-08 21:14                                       ` Yuan Fu
2022-05-09 11:14                                       ` Eli Zaretskii
2022-05-09 12:20                                         ` Theodor Thornhill
2022-05-09 12:23                                           ` Eli Zaretskii
2022-05-09 21:10                                             ` Yuan Fu
2022-05-09 21:33                                               ` Theodor Thornhill
2022-05-14  0:03                                                 ` Yuan Fu
2022-05-14  5:03                                                   ` Theodor Thornhill
2022-05-14  5:13                                                     ` Yuan Fu
2022-05-17 21:45                                                       ` Theodor Thornhill
2022-05-18 20:52                                                         ` Yuan Fu
2022-05-18 21:07                                                           ` Theodor Thornhill
2022-06-16 19:09                                                             ` Yuan Fu
2022-06-17  6:19                                                               ` Eli Zaretskii
2022-06-17  7:32                                                                 ` Yuan Fu
2022-06-17 10:42                                                                   ` Eli Zaretskii
2022-06-18  0:20                                                                     ` Yuan Fu
2022-06-18  6:23                                                                       ` Eli Zaretskii
2022-06-20 14:20                                                                       ` Daniel Martín
2022-06-20 20:03                                                                         ` Yuan Fu
2022-06-17 18:12                                                                   ` Yoav Marco
2022-06-18  0:35                                                                     ` Yuan Fu
2022-06-18  8:15                                                                       ` Yoav Marco
2022-06-18 20:11                                                                         ` Yuan Fu
2022-05-08 22:42                             ` Stephen Leake
2022-05-14 15:09 ` Daniel Martín
2022-05-14 15:55   ` Yuan Fu
2022-05-14 18:50     ` Daniel Martín
2022-05-14 19:09       ` Eli Zaretskii
2022-06-16 19:10       ` Yuan Fu

Code repositories for project(s) associated with this inbox:

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).