unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
From: David Bremner <david@tethera.net>
To: notmuch@notmuchmail.org
Subject: [PATCH 2/3] lib: parse index.as_text
Date: Sat,  3 Sep 2022 20:28:38 -0300	[thread overview]
Message-ID: <20220903232839.1473915-3-david@tethera.net> (raw)
In-Reply-To: <20220903232839.1473915-1-david@tethera.net>

We pre-parse into a list of compiled regular expressions to avoid
calling regcomp on the hot (indexing) path.  As explained in the code
comment, this cannot be done lazily with reasonable error reporting,
at least not without touching a lot of the code in index.cc.
---
 lib/database-private.h |  4 ++++
 lib/open.cc            | 53 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/lib/database-private.h b/lib/database-private.h
index b9be4e22..61232f1a 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -291,6 +291,10 @@ struct _notmuch_database {
 
     /* Track what parameters were specified when opening */
     notmuch_open_param_t params;
+
+    /* list of regular expressions to check for text indexing */
+    regex_t *index_as_text;
+    size_t index_as_text_length;
 };
 
 /* Prior to database version 3, features were implied by the database
diff --git a/lib/open.cc b/lib/open.cc
index 67ff868c..54d1faf3 100644
--- a/lib/open.cc
+++ b/lib/open.cc
@@ -320,6 +320,8 @@ _alloc_notmuch (const char *database_path, const char *config_path, const char *
     notmuch->transaction_count = 0;
     notmuch->transaction_threshold = 0;
     notmuch->view = 1;
+    notmuch->index_as_text = NULL;
+    notmuch->index_as_text_length = 0;
 
     notmuch->params = NOTMUCH_PARAM_NONE;
     if (database_path)
@@ -427,6 +429,53 @@ _load_database_state (notmuch_database_t *notmuch)
 	notmuch, notmuch->xapian_db->get_uuid ().c_str ());
 }
 
+/* Compile the index.as_text config values into notmuch->index_as_text (no-op if already set).
+ * XXX This should really be done lazily, but the error reporting path in the indexing code
+ * would need to be redone to report any errors. */
+notmuch_status_t
+_ensure_index_as_text (notmuch_database_t *notmuch, char **message)
+{
+    int nregex = 0;
+    regex_t *regexv = NULL;
+
+    if (notmuch->index_as_text)
+	return NOTMUCH_STATUS_SUCCESS;
+
+    for (notmuch_config_values_t *list = notmuch_config_get_values (notmuch,
+								    NOTMUCH_CONFIG_INDEX_AS_TEXT);
+	 notmuch_config_values_valid (list);
+	 notmuch_config_values_move_to_next (list)) {
+	regex_t *new_regex;
+	int rerr;
+	const char *str = notmuch_config_values_get (list);
+	size_t len = strlen (str);
+
+	/* str must be non-empty, because notmuch_config_get_values
+	 * skips empty strings */
+	assert (len > 0);
+
+	regexv = talloc_realloc (notmuch, regexv, regex_t, nregex + 1);
+	new_regex = &regexv[nregex];
+
+	rerr = regcomp (new_regex, str, REG_EXTENDED | REG_NOSUB);
+	if (rerr) {
+	    size_t error_size = regerror (rerr, new_regex, NULL, 0);
+	    char *error = (char *) talloc_size (str, error_size);
+
+	    regerror (rerr, new_regex, error, error_size);
+	    IGNORE_RESULT (asprintf (message, "Error in index.as_text: %s: %s\n", error, str));
+
+	    return NOTMUCH_STATUS_ILLEGAL_ARGUMENT;
+	}
+	nregex++;
+    }
+
+    notmuch->index_as_text = regexv;
+    notmuch->index_as_text_length = nregex;
+
+    return NOTMUCH_STATUS_SUCCESS;
+}
+
 static notmuch_status_t
 _finish_open (notmuch_database_t *notmuch,
 	      const char *profile,
@@ -531,6 +580,10 @@ _finish_open (notmuch_database_t *notmuch,
 	if (status)
 	    goto DONE;
 
+	status = _ensure_index_as_text (notmuch, &message);
+	if (status)
+	    goto DONE;
+
 	autocommit_str = notmuch_config_get (notmuch, NOTMUCH_CONFIG_AUTOCOMMIT);
 	if (unlikely (! autocommit_str)) {
 	    INTERNAL_ERROR ("missing configuration for autocommit");
-- 
2.35.2

  parent reply	other threads:[~2022-09-03 23:29 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-03 23:28 index user selected MIME types of attachments David Bremner
2022-09-03 23:28 ` [PATCH 1/3] lib: add config key INDEX_AS_TEXT David Bremner
2022-09-03 23:28 ` David Bremner [this message]
2022-09-03 23:28 ` [PATCH 3/3] lib: index attachements with mime types matching index.as_text David Bremner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://notmuchmail.org/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220903232839.1473915-3-david@tethera.net \
    --to=david@tethera.net \
    --cc=notmuch@notmuchmail.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).