diff --git a/elisa.el b/elisa.el index 09cc28975d..0b79745bd5 100644 --- a/elisa.el +++ b/elisa.el @@ -28,7 +28,6 @@ ;; ELISA (Emacs Lisp Information System Assistant) is a system designed ;; to provide informative answers to user queries by leveraging a ;; Retrieval Augmented Generation (RAG) approach. -;; ;;; Code: (require 'ellama) @@ -48,68 +47,61 @@ (make-llm-ollama :embedding-model "nomic-embed-text")) "Embeddings provider to generate embeddings." - :group 'elisa - :type '(sexp :validate 'cl-struct-p)) + :type '(sexp :validate 'cl-struct-p)) ;a more specific predicate here? (defcustom elisa-chat-provider (progn (require 'llm-ollama) (make-llm-ollama :chat-model "sskostyaev/openchat:8k-rag" :embedding-model "nomic-embed-text")) "Chat provider." - :group 'elisa :type '(sexp :validate 'cl-struct-p)) (defcustom elisa-db-directory (file-truename (file-name-concat user-emacs-directory "elisa")) "Directory for elisa database." - :group 'elisa - :type 'directory) + :type 'directory) ;is it necessary that it exists? (defcustom elisa-limit 5 "Count quotes to pass into llm context for answer." - :group 'elisa - :type 'integer) + :type 'integer) ;or natnum? -(defcustom elisa-find-executable "find" +(defcustom elisa-find-executable find-program "Path to find executable." - :group 'elisa :type 'string) (defcustom elisa-tar-executable "tar" "Path to tar executable." - :group 'elisa :type 'string) (defcustom elisa-sqlite-vss-version "v0.1.2" "Sqlite VSS version." - :group 'elisa :type 'string) (defcustom elisa-sqlite-vss-path nil "Path to sqlite-vss extension." - :group 'elisa :type 'file) (defcustom elisa-sqlite-vector-path nil "Path to sqlite-vector extension." - :group 'elisa :type 'file) -(defcustom elisa-semantic-split-function 'elisa-split-by-paragraph +(defcustom elisa-semantic-split-function #'elisa-split-by-paragraph "Function for semantic text split." 
- :group 'elisa :type 'function) (defcustom elisa-prompt-rewriting-enabled t "Enable prompt rewriting for better retrieving." - :group 'elisa :type 'boolean) -(defcustom elisa-chat-prompt-template "Answer user query based on context above. If you can answer it partially do it. Provide list of open questions if any. Say \"not enough data\" if you can't answer user query based on provided context. User query: +(defcustom elisa-chat-prompt-template + "Answer user query based on context above. \ +If you can answer it partially do it. \ +Provide list of open questions if any. \ +Say \"not enough data\" if you can't answer user \ +query based on provided context. User query: %s" - "Chat prompt template." - :group 'elisa + "Chat prompt template." ;some more explanation would be nice. :type 'string) (defcustom elisa-rewrite-prompt-template @@ -130,81 +122,70 @@ How to buy a pony? User prompt: %s" "Prompt template for prompt rewriting." - :group 'elisa :type 'string) (defcustom elisa-searxng-url "http://localhost:8080/" "Searxng url for web search. Json format should be enabled for this instance." - :group 'elisa :type 'string) (defcustom elisa-pandoc-executable "pandoc" - "Path to pandoc executable." - :group 'elisa + "Path to pandoc (https://pandoc.org/) executable." :type 'string) -(defcustom elisa-webpage-extraction-function 'elisa-get-webpage-buffer +(defcustom elisa-webpage-extraction-function #'elisa-get-webpage-buffer "Function to get buffer with webpage content." - :group 'elisa :type 'function) -(defcustom elisa-web-search-function 'elisa-search-duckduckgo +(defcustom elisa-web-search-function #'elisa-search-duckduckgo "Function to search the web. Function should get prompt and return list of urls." - :group 'elisa :type 'function) (defcustom elisa-web-pages-limit 10 "Limit of web pages to parse during web search." - :group 'elisa - :type 'integer) + :type 'natnum) (defcustom elisa-breakpoint-threshold-amount 0.4 "Breakpoint threshold amount. 
Increase it if you need decrease semantic split granularity." - :group 'elisa - :type 'float) + :type 'number) (defcustom elisa-reranker-enabled nil "Enable reranker to improve retrieving quality." - :group 'elisa :type 'boolean) (defcustom elisa-reranker-url "http://127.0.0.1:8787/" - "Reranker service url." - :group 'elisa + "Reranker service url." ;here as well, this doesn't mean much to someone who don't already know what is going on. :type 'string) (defcustom elisa-reranker-similarity-threshold 0 "Reranker similarity threshold. If set, all quotes with similarity less than threshold will be filtered out." - :group 'elisa - :type 'string) + :type 'string) ;wrong type? (defcustom elisa-reranker-limit 20 "Number of quotes for send to reranker." - :group 'elisa :type 'integer) (defcustom elisa-ignore-patterns-files '(".gitignore" ".ignore" ".rgignore") "Files with patterns to ignore during file parsing." - :group 'elisa - :type '(list string)) + :type '(repeat string)) (defcustom elisa-ignore-invisible-files t "Ignore invisible files and directories during file parsing." - :group 'elisa :type 'boolean) (defcustom elisa-enabled-collections '("builtin manuals" "external manuals") "Enabled collections for elisa chat." - :group 'elisa - :type '(list string)) + :type '(repeat string)) (defun elisa-sqlite-vss-download-url () + ;; It seems to be a general problem that your documentation strings + ;; are not giving any context, and just barley touching on what is + ;; going on... "Generate sqlite vss download url based on current system." - (cond ((string-equal system-type "darwin") - (if (string-prefix-p "aarch64" system-configuration) + (cond ((eq system-type 'darwin) + (if (string-prefix-p "aarch64" system-configuration) ;how robust is this? 
(format "https://github.com/asg017/sqlite-vss/releases/download/%s/sqlite-vss-%s-loadable-macos-aarch64.tar.gz" elisa-sqlite-vss-version @@ -213,7 +194,7 @@ If set, all quotes with similarity less than threshold will be filtered out." "https://github.com/asg017/sqlite-vss/releases/download/%s/sqlite-vss-%s-loadable-macos-x86_64.tar.gz" elisa-sqlite-vss-version elisa-sqlite-vss-version))) - ((string-equal system-type "gnu/linux") + ((eq system-type 'gnu/linux) (format "https://github.com/asg017/sqlite-vss/releases/download/%s/sqlite-vss-%s-loadable-linux-x86_64.tar.gz" elisa-sqlite-vss-version @@ -223,18 +204,14 @@ If set, all quotes with similarity less than threshold will be filtered out." (defun elisa--vss-path () "Path to vss sqlite extension." (or elisa-sqlite-vss-path - (let* ((ext (if (string-equal system-type "darwin") - "dylib" - "so")) + (let* ((ext (if (eq system-type 'darwin) "dylib" "so")) (file (format "vss0.%s" ext))) (file-name-concat elisa-db-directory file)))) (defun elisa--vector-path () "Path to vector sqlite extension." (or elisa-sqlite-vector-path - (let* ((ext (if (string-equal system-type "darwin") - "dylib" - "so")) + (let* ((ext (if (string-equal system-type 'darwin) "dylib" "so")) (file (format "vector0.%s" ext))) (file-name-concat elisa-db-directory file)))) @@ -260,45 +237,45 @@ If set, all quotes with similarity less than threshold will be filtered out." (defun elisa-embeddings-create-table-sql () "Generate sql for create embeddings table." - "drop table if exists elisa_embeddings;") + "DROP TABLE IF EXISTS elisa_embeddings;") ;just my personal taste, ignore if you disagree (i do it because there is no sql syntax highlighting in elisp strings) (defun elisa-data-embeddings-create-table-sql () "Generate sql for create data embeddings table." 
- (format "create virtual table if not exists data_embeddings using vss0(embedding(%d));" + (format "CREATE VIRTUAL TABLE IF NOT EXISTS data_embeddings USING vss0(embedding(%d));" (elisa-get-embedding-size))) (defun elisa-data-fts-create-table-sql () "Generate sql for create full text search table." - "create virtual table if not exists data_fts using fts5(data);") + "CREATE VIRTUAL TABLE IF NOT EXISTS data_fts USING FTS5(data);") (defun elisa-info-create-table-sql () "Generate sql for create info table." - "drop table if exists info;") + "DROP TABLE IF EXISTS info;") (defun elisa-collections-create-table-sql () "Generate sql for create collections table." - "create table if not exists collections (name text unique);") + "CREATE TABLE IF NOT EXISTS collections (name TEXT UNIQUE);") (defun elisa-kinds-create-table-sql () "Generate sql for create kinds table." - "create table if not exists kinds (name text unique);") + "CREATE TABLE IF NOT EXISTS kinds (name TEXT UNIQUE);") (defun elisa-fill-kinds-sql () "Generate sql for fill kinds table." - "insert into kinds (name) values ('web'), ('file'), ('info') on conflict do nothing;") + "INSERT INTO KINDS (name) VALUES ('web'), ('file'), ('info') ON CONFLICT DO NOTHING;") (defun elisa-files-create-table-sql () "Generate sql for create files table." - "create table if not exists files (path text unique, hash text)") + "CREATE TABLE IF NOT EXISTS files (path TEXT UNIQUE, hash TEXT)") (defun elisa-data-create-table-sql () "Generate sql for create data table." - "create table if not exists data ( + "CREATE TABLE IF NOT EXISTS data ( kind_id INTEGER, collection_id INTEGER, -path text, -hash text, -data text, +path TEXT, +hash TEXT, +data TEXT, FOREIGN KEY(kind_id) REFERENCES kinds(rowid), FOREIGN KEY(collection_id) REFERENCES collections(rowid) );") @@ -307,13 +284,9 @@ FOREIGN KEY(collection_id) REFERENCES collections(rowid) "Initialize elisa DB." 
(if (not (file-exists-p (elisa--vss-path))) (warn "Please run M-x `elisa-download-sqlite-vss' to use this package") - (sqlite-pragma db "PRAGMA journal_mode=WAL;") - (sqlite-load-extension - db - (elisa--vector-path)) - (sqlite-load-extension - db - (elisa--vss-path)) + (sqlite-pragma db "PRAGMA journal_mode=WAL;") + (sqlite-load-extension db (elisa--vector-path)) + (sqlite-load-extension db (elisa--vss-path)) (sqlite-execute db (elisa-embeddings-create-table-sql)) (sqlite-execute db (elisa-info-create-table-sql)) (sqlite-execute db (elisa-collections-create-table-sql)) @@ -324,44 +297,46 @@ FOREIGN KEY(collection_id) REFERENCES collections(rowid) (sqlite-execute db (elisa-data-embeddings-create-table-sql)) (sqlite-execute db (elisa-data-fts-create-table-sql)))) -(defvar elisa-db (progn - (make-directory elisa-db-directory t) - (let ((db (sqlite-open (file-name-concat elisa-db-directory "elisa.sqlite")))) - (elisa--init-db db) - db))) +(defvar elisa-db + (let ((_ (make-directory elisa-db-directory t)) + (db (sqlite-open (file-name-concat elisa-db-directory "elisa.sqlite")))) + (elisa--init-db db) + db)) (defun elisa-vector-to-sqlite (data) "Convert DATA to sqlite vector representation." - (format "vector_from_json(json('%s'))" - (json-encode data))) - -(defun elisa-sqlite-escape (s) - "Escape single quotes in S for sqlite." - (thread-last - s - (string-replace "'" "''") - (string-replace "\\" "\\\\") - (string-replace "\0" "\n"))) + (format "vector_from_json(json('%s'))" (json-encode data))) + +(defun elisa-sqlite-escape (string) + "Escape single quotes in STRING for sqlite." + (let ((reps '(("'" . "''") + ("\\" . "\\\\") + ("\0" . "\n")))) + (replace-regexp-in-string ;simultaneous replacement + (regexp-opt (mapcar #'car reps)) ;is the last one really \0 or \\0? + (lambda (str) (alist-get str reps nil nil #'string=)) + string nil t))) (defun elisa-sqlite-format-int-list (ids) "Convert list of integer IDS list to sqlite list representation." 
(format "(%s)" - (string-join (mapcar (lambda (id) (format "%d" id)) ids) ", "))) + (mapconcat (lambda (id) (format "%d" id)) ids ", "))) (defun elisa-sqlite-format-string-list (names) "Convert list of string NAMES list to sqlite list representation." (format "(%s)" - (string-join (mapcar (lambda (name) - (format "'%s'" - (elisa-sqlite-escape name))) names) ", "))) + (mapconcat (lambda (name) + (format "'%s'" + (elisa-sqlite-escape name))) + names ", "))) -(defun elisa-avg (lst) - "Calculate arithmetic average value of LST." - (let ((len (length lst)) - (sum (cl-reduce #'+ lst :initial-value 0.0))) - (/ sum len))) +(defun elisa-avg (list) + "Calculate arithmetic average value of LIST." + (cl-loop for elem in list for count from 0 + summing elem into sum + finally (return (/ sum (float count))))) (defun elisa-std-dev (lst) "Calculate standart deviation value of LST." @@ -450,6 +425,11 @@ FOREIGN KEY(collection_id) REFERENCES collections(rowid) (setq continue nil)))) (setq continue nil)))))))) +;; this is pretty verbose, do you think there might be a more readable +;; way to express this? It is pretty easy to create a small +;; templating language in Elisp +;; (e.g. https://git.savannah.gnu.org/cgit/emacs/elpa.git/tree/elpa-admin.el?h=elpa-admin#n1138), +;; perhaps that could also be useful here. (defun elisa--find-similar (text collections) "Find similar to TEXT results in COLLECTIONS. Return sqlite query. For asyncronous execution." @@ -520,7 +500,7 @@ Evaluate ON-DONE with result." (result nil)) (save-excursion (goto-char (point-min)) - (while (< (point) (point-max)) + (while (not (eobp)) (funcall func) (push (buffer-substring-no-properties pt (point)) result) (setq pt (point))) @@ -534,6 +514,7 @@ Evaluate ON-DONE with result." "Split buffer to list of paragraphs." (elisa--split-by #'forward-paragraph)) +;; a number of these functions seem like something that should be added to the core of Emacs or at least a common ELPA package... 
(defun elisa-dot-product (v1 v2) "Calculate the dot produce of vectors V1 and V2." (let ((result 0)) @@ -608,14 +589,12 @@ than T, it will be packed into single semantic chunk." (current (car chunks)) (tail (cdr chunks))) (let* ((result nil)) - (mapc - (lambda (el) - (if (<= el threshold) + (dolist (el distances) + (if (<= el threshold) (setq current (concat current (car tail))) (push current result) (setq current (car tail))) (setq tail (cdr tail))) - distances) (push current result) (cl-remove-if #'string-empty-p @@ -626,6 +605,7 @@ than T, it will be packed into single semantic chunk." (nreverse result)))) (list (buffer-substring-no-properties (point-min) (point-max))))) +;; why not use `wildcard-to-regexp'? (defun elisa--gitignore-to-elisp-regexp (pattern) "Convert a .gitignore PATTERN to an Emacs Lisp regexp." (let ((result "") @@ -676,11 +656,11 @@ than T, it will be packed into single semantic chunk." (defun elisa--text-file-p (filename) "Check if FILENAME contain text." - (or (when (get-file-buffer filename) t) ;; if file opened assume it text + (or (and (get-file-buffer filename) t) ;; if file opened assume it text (with-current-buffer (find-file-noselect filename t t) (prog1 ;; if there is null byte in file, file is binary - (not (re-search-forward "\0" nil t 1)) + (not (search-forward "\0" nil t 1)) (kill-buffer))))) (defun elisa--file-list (directory) @@ -727,41 +707,39 @@ When FORCE parse even if already parsed." 
(format "delete from files where path = '%s';" (elisa-sqlite-escape path)))) ;; add new data - (mapc - (lambda (text) - (let* ((hash (secure-hash 'sha256 text)) - (rowid - (if-let ((rowid (caar (sqlite-select - elisa-db - (format "select rowid from data where kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';" - kind-id collection-id - (elisa-sqlite-escape path) hash))))) - (progn - (push rowid row-ids) - nil) - (sqlite-execute - elisa-db - (format - "insert into data(kind_id, collection_id, path, hash, data) values (%s, %s, '%s', '%s', '%s');" - kind-id collection-id - (elisa-sqlite-escape path) hash (elisa-sqlite-escape text))) - (caar (sqlite-select - elisa-db - (format "select rowid from data where kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';" - kind-id collection-id - (elisa-sqlite-escape path) hash)))))) - (when rowid - (sqlite-execute - elisa-db - (format "insert into data_embeddings(rowid, embedding) values (%s, %s);" - rowid (elisa-vector-to-sqlite - (llm-embedding elisa-embeddings-provider text)))) - (sqlite-execute - elisa-db - (format "insert into data_fts(rowid, data) values (%s, '%s');" - rowid (elisa-sqlite-escape text))) - (push rowid row-ids)))) - chunks) + (dolist (text chunks) + (let* ((hash (secure-hash 'sha256 text)) + (rowid + (if-let ((rowid (caar (sqlite-select + elisa-db + (format "select rowid from data where kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';" + kind-id collection-id + (elisa-sqlite-escape path) hash))))) + (progn + (push rowid row-ids) + nil) + (sqlite-execute + elisa-db + (format + "insert into data(kind_id, collection_id, path, hash, data) values (%s, %s, '%s', '%s', '%s');" + kind-id collection-id + (elisa-sqlite-escape path) hash (elisa-sqlite-escape text))) + (caar (sqlite-select + elisa-db + (format "select rowid from data where kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';" + kind-id collection-id + (elisa-sqlite-escape path) 
hash)))))) + (when rowid + (sqlite-execute + elisa-db + (format "insert into data_embeddings(rowid, embedding) values (%s, %s);" + rowid (elisa-vector-to-sqlite + (llm-embedding elisa-embeddings-provider text)))) + (sqlite-execute + elisa-db + (format "insert into data_fts(rowid, data) values (%s, '%s');" + rowid (elisa-sqlite-escape text))) + (push rowid row-ids)))) ;; remove old data (when row-ids (let ((delete-rows (cl-remove-if (lambda (id) @@ -779,7 +757,7 @@ When FORCE parse even if already parsed." (defun elisa--delete-data (ids) "Delete data with IDS." - (sqlite-execute + (sqlite-execute ;perhaps it would be worth extracting the (sqlite-execute elisa-db ...) part into a separate function elisa-db (format "delete from data_fts where rowid in %s;" (elisa-sqlite-format-int-list ids))) @@ -815,10 +793,9 @@ When FORCE parse even if already parsed." collection-id (elisa-sqlite-format-string-list files)))))) (elisa--delete-data delete-ids) - (mapc (lambda (file) - (message "parsing %s" file) - (elisa-parse-file collection-id file)) - files))) + (dolist (file files) + (message "parsing %s" file) + (elisa-parse-file collection-id file)))) ;;;###autoload (defun elisa-async-parse-directory (dir) @@ -853,14 +830,14 @@ When FORCE parse even if already parsed." (libxml-parse-html-region (point) (point-max)) 'a)) - :test 'string-equal))))) + :test #'string-equal))))) (defun elisa-search-searxng (prompt) "Search searxng for PROMPT and return list of urls. You can customize `elisa-searxng-url' to use non local instance." (let ((url (format "%s/search?format=json&q=%s" elisa-searxng-url (url-hexify-string prompt)))) (thread-last - (plz 'get url :as 'json-read) + (plz 'get url :as 'json-read) ;I am not familiar with the "plz" library, is the `json-read' a function? (alist-get 'results) (mapcar (lambda (el) (alist-get 'url el)))))) @@ -896,14 +873,13 @@ You can customize `elisa-searxng-url' to use non local instance." 
(with-current-buffer buffer-name (shell-command-on-region (point-min) (point-max) - (format "%s -f html --to plain" - (executable-find elisa-pandoc-executable)) + (format "%s --from html --to plain" elisa-pandoc-executable) buffer-name t) buffer-name))) (defun elisa-fts-query (prompt) "Return fts match query for PROMPT." - (thread-last + (thread-last ;I believe you can do all of this with a single regular expression... prompt (string-trim) (downcase) @@ -938,7 +914,7 @@ You can customize `elisa-searxng-url' to use non local instance." :headers `(("Content-Type" . "application/json")) :body-type 'text :body (elisa--rerank-request prompt ids) - :as #'json-read))))) + :as #'json-read))))) ;so it is a function! (defun elisa-rerank (prompt ids) "Rerank IDS according to PROMPT and return top `elisa-limit' IDS." @@ -962,11 +938,10 @@ You can customize `elisa-searxng-url' to use non local instance." (defun elisa--parse-web-page (collection-id url) "Parse URL into collection with COLLECTION-ID." (let ((kind-id (caar (sqlite-select - elisa-db "select rowid from kinds where name = 'web';")))) - (message "collecting data from %s" url) - (mapc - (lambda (chunk) - (let* ((hash (secure-hash 'sha256 chunk)) + elisa-db "SELECT rowid FROM kinds WHERE name = 'web';")))) + (message "collecting data from %S..." url) + (dolist (chunk (elisa-extact-webpage-chunks url)) + (let* ((hash (secure-hash 'sha256 chunk)) (embedding (llm-embedding elisa-embeddings-provider chunk)) (rowid (if-let ((rowid (caar (sqlite-select @@ -989,8 +964,7 @@ You can customize `elisa-searxng-url' to use non local instance." (sqlite-execute elisa-db (format "insert into data_fts(rowid, data) values (%s, '%s');" - rowid (elisa-sqlite-escape chunk)))))) - (elisa-extact-webpage-chunks url)))) + rowid (elisa-sqlite-escape chunk)))))))) (defun elisa--web-search (prompt) "Search the web for PROMPT. @@ -1007,11 +981,10 @@ Return sqlite query that extract data for adding to context." 
(elisa-sqlite-escape prompt))))) (urls (funcall elisa-web-search-function prompt)) (collected-pages 0)) - (mapc (lambda (url) - (when (<= collected-pages elisa-web-pages-limit) - (elisa--parse-web-page collection-id url) - (cl-incf collected-pages))) - urls))) + (dolist (url urls) + (when (<= collected-pages elisa-web-pages-limit) + (elisa--parse-web-page collection-id url) + (cl-incf collected-pages))))) (defun elisa--rewrite-prompt (prompt action) "Rewrite PROMPT if `elisa-prompt-rewriting-enabled'. @@ -1071,7 +1044,7 @@ WHERE d.rowid in %s;" (when-let ((kind (cl-first row)) (path (cl-second row)) (text (cl-third row))) - (pcase kind + (pcase kind ;is this a `pcase-exhaustive'? ("web" (ellama-context-add-webpage-quote-noninteractive path path text)) ("file" @@ -1096,15 +1069,16 @@ WHERE d.rowid in %s;" (mapcar #'file-name-base (cl-remove-if-not - (lambda (s) (or (string-suffix-p ".info" s) - (string-suffix-p ".info.gz" s))) + (lambda (s) + (or (string-suffix-p ".info" s) + (string-suffix-p ".info.gz" s))) (directory-files (with-temp-buffer (info "emacs" (current-buffer)) (file-name-directory Info-current-file)))))) (defun elisa-get-external-manuals () "Get external manual names list." - (cl-remove-if + (cl-remove-if ;a `thread-last' might be nice here #'not (mapcar #'elisa--info-valid-p @@ -1112,7 +1086,7 @@ WHERE d.rowid in %s;" (mapcar #'file-name-base (process-lines - (executable-find elisa-find-executable) + elisa-find-executable (file-truename (file-name-concat user-emacs-directory "elpa")) "-name" "*.info")))))) @@ -1207,7 +1181,7 @@ It does nothing if buffer file not inside one of existing collections." (when-let* ((collections (flatten-tree (sqlite-select elisa-db - "select name from collections;"))) + "SELECT name FROM collections;"))) (dirs (cl-remove-if-not #'file-directory-p collections)) (file (buffer-file-name)) (collection (cl-find-if (lambda (dir) @@ -1283,7 +1257,7 @@ It does nothing if buffer file not inside one of existing collections." 
"Add webpage by URL to COLLECTION." (interactive (list - (if-let ((url (or (and (fboundp 'thing-at-point) (thing-at-point 'url)) + (if-let ((url (or (and (fboundp 'thing-at-point) (thing-at-point 'url)) ;why not always use `thing-at-point'? (shr-url-at-point nil)))) url (read-string "Enter URL you want to summarize: "))