unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
* index user selected MIME types of attachments
@ 2022-09-03 23:28 David Bremner
  2022-09-03 23:28 ` [PATCH 1/3] lib: add config key INDEX_AS_TEXT David Bremner
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: David Bremner @ 2022-09-03 23:28 UTC (permalink / raw)
  To: notmuch

This series obsoletes the WIP patch [1]. Most of the work is in making
it configurable.

[1]: id:20220820185007.289543-4-david@tethera.net


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] lib: add config key INDEX_AS_TEXT
  2022-09-03 23:28 index user selected MIME types of attachments David Bremner
@ 2022-09-03 23:28 ` David Bremner
  2022-09-03 23:28 ` [PATCH 2/3] lib: parse index.as_text David Bremner
  2022-09-03 23:28 ` [PATCH 3/3] lib: index attachements with mime types matching index.as_text David Bremner
  2 siblings, 0 replies; 4+ messages in thread
From: David Bremner @ 2022-09-03 23:28 UTC (permalink / raw)
  To: notmuch

Higher-level processing (as a list of regular expressions) and
documentation will follow.
---
 lib/config.cc            | 3 +++
 lib/notmuch.h            | 1 +
 test/T030-config.sh      | 1 +
 test/T055-path-config.sh | 1 +
 test/T590-libconfig.sh   | 5 +++++
 5 files changed, 11 insertions(+)

diff --git a/lib/config.cc b/lib/config.cc
index 503a0c8b..2323860d 100644
--- a/lib/config.cc
+++ b/lib/config.cc
@@ -599,6 +599,8 @@ _notmuch_config_key_to_string (notmuch_config_key_t key)
 	return "database.autocommit";
     case NOTMUCH_CONFIG_EXTRA_HEADERS:
 	return "show.extra_headers";
+    case NOTMUCH_CONFIG_INDEX_AS_TEXT:
+	return "index.as_text";
     default:
 	return NULL;
     }
@@ -642,6 +644,7 @@ _notmuch_config_default (notmuch_database_t *notmuch, notmuch_config_key_t key)
 	else
 	    email = _get_email_from_passwd_file (notmuch);
 	return email;
+    case NOTMUCH_CONFIG_INDEX_AS_TEXT:
     case NOTMUCH_CONFIG_NEW_IGNORE:
 	return "";
     case NOTMUCH_CONFIG_AUTOCOMMIT:
diff --git a/lib/notmuch.h b/lib/notmuch.h
index 0b0540b1..935a8d59 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -2558,6 +2558,7 @@ typedef enum {
     NOTMUCH_CONFIG_USER_NAME,
     NOTMUCH_CONFIG_AUTOCOMMIT,
     NOTMUCH_CONFIG_EXTRA_HEADERS,
+    NOTMUCH_CONFIG_INDEX_AS_TEXT,
     NOTMUCH_CONFIG_LAST
 } notmuch_config_key_t;
 
diff --git a/test/T030-config.sh b/test/T030-config.sh
index 43bbce31..ea0b4012 100755
--- a/test/T030-config.sh
+++ b/test/T030-config.sh
@@ -57,6 +57,7 @@ database.mail_root=MAIL_DIR
 database.path=MAIL_DIR
 foo.list=this;is another;list value;
 foo.string=this is another string value
+index.as_text=
 maildir.synchronize_flags=true
 new.ignore=
 new.tags=unread;inbox
diff --git a/test/T055-path-config.sh b/test/T055-path-config.sh
index fe295324..efc79e8b 100755
--- a/test/T055-path-config.sh
+++ b/test/T055-path-config.sh
@@ -299,6 +299,7 @@ database.backup_dir
 database.hook_dir
 database.mail_root=MAIL_DIR
 database.path
+index.as_text=
 maildir.synchronize_flags=true
 new.ignore=
 new.tags=unread;inbox
diff --git a/test/T590-libconfig.sh b/test/T590-libconfig.sh
index 26a1f033..9326ba3e 100755
--- a/test/T590-libconfig.sh
+++ b/test/T590-libconfig.sh
@@ -440,6 +440,7 @@ cat <<'EOF' >EXPECTED
 10: 'USER_FULL_NAME'
 11: '8000'
 12: 'NULL'
+13: ''
 == stderr ==
 EOF
 unset MAILDIR
@@ -725,6 +726,7 @@ test_expect_equal_file EXPECTED OUTPUT
 test_begin_subtest "list by keys (ndlc)"
 notmuch config set search.exclude_tags "foo;bar;fub"
 notmuch config set new.ignore "sekrit_junk"
+notmuch config set index.as_text "text/"
 cat c_head2 - c_tail <<'EOF' | test_C ${MAIL_DIR} %NULL% %NULL%
 {
     notmuch_config_key_t key;
@@ -751,6 +753,7 @@ cat <<'EOF' >EXPECTED
 10: 'Notmuch Test Suite'
 11: '8000'
 12: 'NULL'
+13: 'text/'
 == stderr ==
 EOF
 test_expect_equal_file EXPECTED OUTPUT
@@ -785,6 +788,7 @@ cat <<'EOF' >EXPECTED
 10: 'USER_FULL_NAME'
 11: '8000'
 12: 'NULL'
+13: ''
 == stderr ==
 EOF
 test_expect_equal_file EXPECTED OUTPUT.clean
@@ -856,6 +860,7 @@ database.backup_dir MAIL_DIR/.notmuch/backups
 database.hook_dir MAIL_DIR/.notmuch/hooks
 database.mail_root MAIL_DIR
 database.path MAIL_DIR
+index.as_text text/
 key with spaces value, with, spaces!
 maildir.synchronize_flags true
 new.ignore sekrit_junk
-- 
2.35.2

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] lib: parse index.as_text
  2022-09-03 23:28 index user selected MIME types of attachments David Bremner
  2022-09-03 23:28 ` [PATCH 1/3] lib: add config key INDEX_AS_TEXT David Bremner
@ 2022-09-03 23:28 ` David Bremner
  2022-09-03 23:28 ` [PATCH 3/3] lib: index attachements with mime types matching index.as_text David Bremner
  2 siblings, 0 replies; 4+ messages in thread
From: David Bremner @ 2022-09-03 23:28 UTC (permalink / raw)
  To: notmuch

We pre-parse into a list of compiled regular expressions to avoid
calling regcomp on the hot (indexing) path.  As explained in the code
comment, this cannot be done lazily with reasonable error reporting,
at least not without touching a lot of the code in index.cc.
---
 lib/database-private.h |  4 ++++
 lib/open.cc            | 53 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/lib/database-private.h b/lib/database-private.h
index b9be4e22..61232f1a 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -291,6 +291,10 @@ struct _notmuch_database {
 
     /* Track what parameters were specified when opening */
     notmuch_open_param_t params;
+
+    /* list of regular expressions to check for text indexing */
+    regex_t *index_as_text;
+    size_t index_as_text_length;
 };
 
 /* Prior to database version 3, features were implied by the database
diff --git a/lib/open.cc b/lib/open.cc
index 67ff868c..54d1faf3 100644
--- a/lib/open.cc
+++ b/lib/open.cc
@@ -320,6 +320,8 @@ _alloc_notmuch (const char *database_path, const char *config_path, const char *
     notmuch->transaction_count = 0;
     notmuch->transaction_threshold = 0;
     notmuch->view = 1;
+    notmuch->index_as_text = NULL;
+    notmuch->index_as_text_length = 0;
 
     notmuch->params = NOTMUCH_PARAM_NONE;
     if (database_path)
@@ -427,6 +429,53 @@ _load_database_state (notmuch_database_t *notmuch)
 	notmuch, notmuch->xapian_db->get_uuid ().c_str ());
 }
 
+/* XXX This should really be done lazily, but the error reporting path in the indexing code
+ * would need to be redone to report any errors.
+ */
+notmuch_status_t
+_ensure_index_as_text (notmuch_database_t *notmuch, char **message)
+{
+    int nregex = 0;
+    regex_t *regexv = NULL;
+
+    if (notmuch->index_as_text)
+	return NOTMUCH_STATUS_SUCCESS;
+
+    for (notmuch_config_values_t *list = notmuch_config_get_values (notmuch,
+								    NOTMUCH_CONFIG_INDEX_AS_TEXT);
+	 notmuch_config_values_valid (list);
+	 notmuch_config_values_move_to_next (list)) {
+	regex_t *new_regex;
+	int rerr;
+	const char *str = notmuch_config_values_get (list);
+	size_t len = strlen (str);
+
+	/* str must be non-empty, because n_c_get_values skips empty
+	 * strings */
+	assert (len > 0);
+
+	regexv = talloc_realloc (notmuch, regexv, regex_t, nregex + 1);
+	new_regex = &regexv[nregex];
+
+	rerr = regcomp (new_regex, str, REG_EXTENDED | REG_NOSUB);
+	if (rerr) {
+	    size_t error_size = regerror (rerr, new_regex, NULL, 0);
+	    char *error = (char *) talloc_size (str, error_size);
+
+	    regerror (rerr, new_regex, error, error_size);
+	    IGNORE_RESULT (asprintf (message, "Error in index.as_text: %s: %s\n", error, str));
+
+	    return NOTMUCH_STATUS_ILLEGAL_ARGUMENT;
+	}
+	nregex++;
+    }
+
+    notmuch->index_as_text = regexv;
+    notmuch->index_as_text_length = nregex;
+
+    return NOTMUCH_STATUS_SUCCESS;
+}
+
 static notmuch_status_t
 _finish_open (notmuch_database_t *notmuch,
 	      const char *profile,
@@ -531,6 +580,10 @@ _finish_open (notmuch_database_t *notmuch,
 	if (status)
 	    goto DONE;
 
+	status = _ensure_index_as_text (notmuch, &message);
+	if (status)
+	    goto DONE;
+
 	autocommit_str = notmuch_config_get (notmuch, NOTMUCH_CONFIG_AUTOCOMMIT);
 	if (unlikely (! autocommit_str)) {
 	    INTERNAL_ERROR ("missing configuration for autocommit");
-- 
2.35.2

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] lib: index attachements with mime types matching index.as_text
  2022-09-03 23:28 index user selected MIME types of attachments David Bremner
  2022-09-03 23:28 ` [PATCH 1/3] lib: add config key INDEX_AS_TEXT David Bremner
  2022-09-03 23:28 ` [PATCH 2/3] lib: parse index.as_text David Bremner
@ 2022-09-03 23:28 ` David Bremner
  2 siblings, 0 replies; 4+ messages in thread
From: David Bremner @ 2022-09-03 23:28 UTC (permalink / raw)
  To: notmuch

Instead of skipping the indexing of all attachments, we check for a
(user-configured) MIME type that is indexable as text.
---
 doc/man1/notmuch-config.rst |  7 +++++++
 lib/database.cc             | 12 ++++++++++++
 lib/index.cc                | 25 ++++++++++++++++++++++---
 lib/notmuch-private.h       |  4 ++++
 test/T050-new.sh            | 37 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 388315f6..caa3bd65 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -122,6 +122,13 @@ paths are presumed relative to `$HOME` for items in section
 
     Default tag prefix (filter) for :any:`notmuch-git`.
 
+.. nmconfig:: index.as_text
+
+   List of regular expressions (without delimiters) for MIME types to
+   be indexed as text. Currently this applies only to attachments.
+
+   History: This configuration value was introduced in notmuch 0.38.
+
 .. nmconfig:: index.decrypt
 
     Policy for decrypting encrypted messages during indexing.  Must be
diff --git a/lib/database.cc b/lib/database.cc
index c05d70d3..6b962a15 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
 {
     return notmuch->status_string;
 }
+
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
+{
+    for (size_t i = 0; i < notmuch->index_as_text_length; i++) {
+	if (regexec (&notmuch->index_as_text[i], mime_string, 0, NULL, 0) == 0) {
+	    return true;
+	}
+    }
+
+    return false;
+}
diff --git a/lib/index.cc b/lib/index.cc
index 728bfb22..629dcb22 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
 		   GMimeObject *part,
 		   _notmuch_message_crypto_t *msg_crypto);
 
+static bool
+_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
+{
+    GMimeContentType *content_type = g_mime_object_get_content_type (part);
+    notmuch_database_t *notmuch = notmuch_message_get_database (message);
+
+    if (content_type) {
+	char *mime_string = g_mime_content_type_get_mime_type (content_type);
+	if (mime_string) {
+	    bool ret = _notmuch_database_indexable_as_text (notmuch, mime_string);
+	    g_free (mime_string);
+	    return ret;
+	}
+    }
+    return false;
+}
+
 /* Callback to generate terms for each mime part of a message. */
 static void
 _index_mime_part (notmuch_message_t *message,
@@ -497,9 +514,11 @@ _index_mime_part (notmuch_message_t *message,
 	_notmuch_message_add_term (message, "tag", "attachment");
 	_notmuch_message_gen_terms (message, "attachment", filename);
 
-	/* XXX: Would be nice to call out to something here to parse
-	 * the attachment into text and then index that. */
-	goto DONE;
+	if (! _indexable_as_text (message, part)) {
+	    /* XXX: Would be nice to call out to something here to parse
+	     * the attachment into text and then index that. */
+	    goto DONE;
+	}
     }
 
     byte_array = g_byte_array_new ();
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 1d3d2b0c..c19ee8e2 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
 					notmuch_find_flags_t flags,
 					char **direntry);
 
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
+				     const char *mime_string);
+
 /* directory.cc */
 
 notmuch_directory_t *
diff --git a/test/T050-new.sh b/test/T050-new.sh
index cb67889c..427c5b22 100755
--- a/test/T050-new.sh
+++ b/test/T050-new.sh
@@ -455,12 +455,47 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
 EOF
 test_expect_equal_file EXPECTED OUTPUT
 
+add_email_corpus indexing
+test_begin_subtest "index text/* attachments, no config"
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+notmuch config set index.as_text "text/"
 add_email_corpus indexing
 
 test_begin_subtest "index text/* attachments"
-test_subtest_known_broken
 notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
 notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
 test_expect_equal_file_nonempty EXPECTED OUTPUT
 
+test_begin_subtest "reindex text/* attachments, no config"
+notmuch config set index.as_text
+notmuch reindex '*'
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "reindex text/* attachments"
+notmuch config set index.as_text text/
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex text/* attachments, second regex"
+notmuch config set index.as_text "blahblah;text/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex text/* attachments, bad regex"
+notmuch config set index.as_text '['
+notmuch reindex '*' >& OUTPUT
+cat<<EOF > EXPECTED
+Error in index.as_text: Invalid regular expression: [
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
 test_done
-- 
2.35.2

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-09-03 23:29 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-03 23:28 index user selected MIME types of attachments David Bremner
2022-09-03 23:28 ` [PATCH 1/3] lib: add config key INDEX_AS_TEXT David Bremner
2022-09-03 23:28 ` [PATCH 2/3] lib: parse index.as_text David Bremner
2022-09-03 23:28 ` [PATCH 3/3] lib: index attachements with mime types matching index.as_text David Bremner

Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).