unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
* v2 Index some attachements as text
@ 2023-01-06  0:02 David Bremner
  2023-01-06  0:02 ` [PATCH v2 1/3] lib: add config key INDEX_AS_TEXT David Bremner
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: David Bremner @ 2023-01-06  0:02 UTC (permalink / raw)
  To: notmuch

This obsoletes the series starting with

     id:20220903232839.1473915-2-david@tethera.net

Compared to that series this is rebased against master, it has some
more tests (including both positive and negative tests) and it
documents the non-anchoredness of the involved regex search.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2 1/3] lib: add config key INDEX_AS_TEXT
  2023-01-06  0:02 v2 Index some attachements as text David Bremner
@ 2023-01-06  0:02 ` David Bremner
  2023-01-06  0:02 ` [PATCH v2 2/3] lib: parse index.as_text David Bremner
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: David Bremner @ 2023-01-06  0:02 UTC (permalink / raw)
  To: notmuch

Higher level processing as a list of regular expressions and
documentation will follow.
---
 lib/config.cc            | 3 +++
 lib/notmuch.h            | 1 +
 test/T030-config.sh      | 1 +
 test/T055-path-config.sh | 1 +
 test/T590-libconfig.sh   | 5 +++++
 5 files changed, 11 insertions(+)

diff --git a/lib/config.cc b/lib/config.cc
index 503a0c8b..2323860d 100644
--- a/lib/config.cc
+++ b/lib/config.cc
@@ -599,6 +599,8 @@ _notmuch_config_key_to_string (notmuch_config_key_t key)
 	return "database.autocommit";
     case NOTMUCH_CONFIG_EXTRA_HEADERS:
 	return "show.extra_headers";
+    case NOTMUCH_CONFIG_INDEX_AS_TEXT:
+	return "index.as_text";
     default:
 	return NULL;
     }
@@ -642,6 +644,7 @@ _notmuch_config_default (notmuch_database_t *notmuch, notmuch_config_key_t key)
 	else
 	    email = _get_email_from_passwd_file (notmuch);
 	return email;
+    case NOTMUCH_CONFIG_INDEX_AS_TEXT:
     case NOTMUCH_CONFIG_NEW_IGNORE:
 	return "";
     case NOTMUCH_CONFIG_AUTOCOMMIT:
diff --git a/lib/notmuch.h b/lib/notmuch.h
index 0b0540b1..935a8d59 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -2558,6 +2558,7 @@ typedef enum {
     NOTMUCH_CONFIG_USER_NAME,
     NOTMUCH_CONFIG_AUTOCOMMIT,
     NOTMUCH_CONFIG_EXTRA_HEADERS,
+    NOTMUCH_CONFIG_INDEX_AS_TEXT,
     NOTMUCH_CONFIG_LAST
 } notmuch_config_key_t;
 
diff --git a/test/T030-config.sh b/test/T030-config.sh
index 43bbce31..ea0b4012 100755
--- a/test/T030-config.sh
+++ b/test/T030-config.sh
@@ -57,6 +57,7 @@ database.mail_root=MAIL_DIR
 database.path=MAIL_DIR
 foo.list=this;is another;list value;
 foo.string=this is another string value
+index.as_text=
 maildir.synchronize_flags=true
 new.ignore=
 new.tags=unread;inbox
diff --git a/test/T055-path-config.sh b/test/T055-path-config.sh
index fe295324..efc79e8b 100755
--- a/test/T055-path-config.sh
+++ b/test/T055-path-config.sh
@@ -299,6 +299,7 @@ database.backup_dir
 database.hook_dir
 database.mail_root=MAIL_DIR
 database.path
+index.as_text=
 maildir.synchronize_flags=true
 new.ignore=
 new.tags=unread;inbox
diff --git a/test/T590-libconfig.sh b/test/T590-libconfig.sh
index 26a1f033..9326ba3e 100755
--- a/test/T590-libconfig.sh
+++ b/test/T590-libconfig.sh
@@ -440,6 +440,7 @@ cat <<'EOF' >EXPECTED
 10: 'USER_FULL_NAME'
 11: '8000'
 12: 'NULL'
+13: ''
 == stderr ==
 EOF
 unset MAILDIR
@@ -725,6 +726,7 @@ test_expect_equal_file EXPECTED OUTPUT
 test_begin_subtest "list by keys (ndlc)"
 notmuch config set search.exclude_tags "foo;bar;fub"
 notmuch config set new.ignore "sekrit_junk"
+notmuch config set index.as_text "text/"
 cat c_head2 - c_tail <<'EOF' | test_C ${MAIL_DIR} %NULL% %NULL%
 {
     notmuch_config_key_t key;
@@ -751,6 +753,7 @@ cat <<'EOF' >EXPECTED
 10: 'Notmuch Test Suite'
 11: '8000'
 12: 'NULL'
+13: 'text/'
 == stderr ==
 EOF
 test_expect_equal_file EXPECTED OUTPUT
@@ -785,6 +788,7 @@ cat <<'EOF' >EXPECTED
 10: 'USER_FULL_NAME'
 11: '8000'
 12: 'NULL'
+13: ''
 == stderr ==
 EOF
 test_expect_equal_file EXPECTED OUTPUT.clean
@@ -856,6 +860,7 @@ database.backup_dir MAIL_DIR/.notmuch/backups
 database.hook_dir MAIL_DIR/.notmuch/hooks
 database.mail_root MAIL_DIR
 database.path MAIL_DIR
+index.as_text text/
 key with spaces value, with, spaces!
 maildir.synchronize_flags true
 new.ignore sekrit_junk
-- 
2.39.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v2 2/3] lib: parse index.as_text
  2023-01-06  0:02 v2 Index some attachements as text David Bremner
  2023-01-06  0:02 ` [PATCH v2 1/3] lib: add config key INDEX_AS_TEXT David Bremner
@ 2023-01-06  0:02 ` David Bremner
  2023-01-06  0:02 ` [PATCH v2 3/3] lib: index attachments with mime types matching index.as_text David Bremner
  2023-04-02 22:37 ` v2 Index some attachements as text David Bremner
  3 siblings, 0 replies; 5+ messages in thread
From: David Bremner @ 2023-01-06  0:02 UTC (permalink / raw)
  To: notmuch

We pre-parse into a list of compiled regular expressions to avoid
calling regcomp on the hot (indexing) path.  As explained in the code
comment, this cannot be done lazily with reasonable error reporting,
at least not without touching a lot of the code in index.cc.
---
 lib/database-private.h |  4 ++++
 lib/open.cc            | 53 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/lib/database-private.h b/lib/database-private.h
index b9be4e22..61232f1a 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -291,6 +291,10 @@ struct _notmuch_database {
 
     /* Track what parameters were specified when opening */
     notmuch_open_param_t params;
+
+    /* list of regular expressions to check for text indexing */
+    regex_t *index_as_text;
+    size_t index_as_text_length;
 };
 
 /* Prior to database version 3, features were implied by the database
diff --git a/lib/open.cc b/lib/open.cc
index 67ff868c..54d1faf3 100644
--- a/lib/open.cc
+++ b/lib/open.cc
@@ -320,6 +320,8 @@ _alloc_notmuch (const char *database_path, const char *config_path, const char *
     notmuch->transaction_count = 0;
     notmuch->transaction_threshold = 0;
     notmuch->view = 1;
+    notmuch->index_as_text = NULL;
+    notmuch->index_as_text_length = 0;
 
     notmuch->params = NOTMUCH_PARAM_NONE;
     if (database_path)
@@ -427,6 +429,53 @@ _load_database_state (notmuch_database_t *notmuch)
 	notmuch, notmuch->xapian_db->get_uuid ().c_str ());
 }
 
+/* XXX This should really be done lazily, but the error reporting path in the indexing code
+ * would need to be redone to report any errors.
+ */
+notmuch_status_t
+_ensure_index_as_text (notmuch_database_t *notmuch, char **message)
+{
+    int nregex = 0;
+    regex_t *regexv = NULL;
+
+    if (notmuch->index_as_text)
+	return NOTMUCH_STATUS_SUCCESS;
+
+    for (notmuch_config_values_t *list = notmuch_config_get_values (notmuch,
+								    NOTMUCH_CONFIG_INDEX_AS_TEXT);
+	 notmuch_config_values_valid (list);
+	 notmuch_config_values_move_to_next (list)) {
+	regex_t *new_regex;
+	int rerr;
+	const char *str = notmuch_config_values_get (list);
+	size_t len = strlen (str);
+
+	/* str must be non-empty, because n_c_get_values skips empty
+	 * strings */
+	assert (len > 0);
+
+	regexv = talloc_realloc (notmuch, regexv, regex_t, nregex + 1);
+	new_regex = &regexv[nregex];
+
+	rerr = regcomp (new_regex, str, REG_EXTENDED | REG_NOSUB);
+	if (rerr) {
+	    size_t error_size = regerror (rerr, new_regex, NULL, 0);
+	    char *error = (char *) talloc_size (str, error_size);
+
+	    regerror (rerr, new_regex, error, error_size);
+	    IGNORE_RESULT (asprintf (message, "Error in index.as_text: %s: %s\n", error, str));
+
+	    return NOTMUCH_STATUS_ILLEGAL_ARGUMENT;
+	}
+	nregex++;
+    }
+
+    notmuch->index_as_text = regexv;
+    notmuch->index_as_text_length = nregex;
+
+    return NOTMUCH_STATUS_SUCCESS;
+}
+
 static notmuch_status_t
 _finish_open (notmuch_database_t *notmuch,
 	      const char *profile,
@@ -531,6 +580,10 @@ _finish_open (notmuch_database_t *notmuch,
 	if (status)
 	    goto DONE;
 
+	status = _ensure_index_as_text (notmuch, &message);
+	if (status)
+	    goto DONE;
+
 	autocommit_str = notmuch_config_get (notmuch, NOTMUCH_CONFIG_AUTOCOMMIT);
 	if (unlikely (! autocommit_str)) {
 	    INTERNAL_ERROR ("missing configuration for autocommit");
-- 
2.39.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v2 3/3] lib: index attachments with mime types matching index.as_text
  2023-01-06  0:02 v2 Index some attachements as text David Bremner
  2023-01-06  0:02 ` [PATCH v2 1/3] lib: add config key INDEX_AS_TEXT David Bremner
  2023-01-06  0:02 ` [PATCH v2 2/3] lib: parse index.as_text David Bremner
@ 2023-01-06  0:02 ` David Bremner
  2023-04-02 22:37 ` v2 Index some attachements as text David Bremner
  3 siblings, 0 replies; 5+ messages in thread
From: David Bremner @ 2023-01-06  0:02 UTC (permalink / raw)
  To: notmuch

Instead of skipping indexing all attachments, we check for a (user
configured) mime type that is indexable as text.
---
 doc/man1/notmuch-config.rst        | 10 ++++
 lib/database.cc                    | 12 +++++
 lib/index.cc                       | 25 ++++++++--
 lib/notmuch-private.h              |  4 ++
 test/T050-new.sh                   |  8 ----
 test/T760-as-text.sh               | 77 ++++++++++++++++++++++++++++++
 test/corpora/indexing/fake-pdf:2,S | 11 +++++
 7 files changed, 136 insertions(+), 11 deletions(-)
 create mode 100755 test/T760-as-text.sh
 create mode 100644 test/corpora/indexing/fake-pdf:2,S

diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 388315f6..988dc5a3 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -122,6 +122,16 @@ paths are presumed relative to `$HOME` for items in section
 
     Default tag prefix (filter) for :any:`notmuch-git`.
 
+.. nmconfig:: index.as_text
+
+   List of regular expressions (without delimiters) for MIME types to
+   be indexed as text. Currently this applies only to attachments.  By
+   default the regex matches anywhere in the content type; if the
+   user wants an anchored match, they should include anchors in their
+   regexes.
+
+   History: This configuration value was introduced in notmuch 0.38.
+
 .. nmconfig:: index.decrypt
 
     Policy for decrypting encrypted messages during indexing.  Must be
diff --git a/lib/database.cc b/lib/database.cc
index d1e5f1af..6987e2f4 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
 {
     return notmuch->status_string;
 }
+
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
+{
+    for (size_t i = 0; i < notmuch->index_as_text_length; i++) {
+	if (regexec (&notmuch->index_as_text[i], mime_string, 0, NULL, 0) == 0) {
+	    return true;
+	}
+    }
+
+    return false;
+}
diff --git a/lib/index.cc b/lib/index.cc
index 728bfb22..629dcb22 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
 		   GMimeObject *part,
 		   _notmuch_message_crypto_t *msg_crypto);
 
+static bool
+_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
+{
+    GMimeContentType *content_type = g_mime_object_get_content_type (part);
+    notmuch_database_t *notmuch = notmuch_message_get_database (message);
+
+    if (content_type) {
+	char *mime_string = g_mime_content_type_get_mime_type (content_type);
+	if (mime_string) {
+	    bool ret = _notmuch_database_indexable_as_text (notmuch, mime_string);
+	    g_free (mime_string);
+	    return ret;
+	}
+    }
+    return false;
+}
+
 /* Callback to generate terms for each mime part of a message. */
 static void
 _index_mime_part (notmuch_message_t *message,
@@ -497,9 +514,11 @@ _index_mime_part (notmuch_message_t *message,
 	_notmuch_message_add_term (message, "tag", "attachment");
 	_notmuch_message_gen_terms (message, "attachment", filename);
 
-	/* XXX: Would be nice to call out to something here to parse
-	 * the attachment into text and then index that. */
-	goto DONE;
+	if (! _indexable_as_text (message, part)) {
+	    /* XXX: Would be nice to call out to something here to parse
+	     * the attachment into text and then index that. */
+	    goto DONE;
+	}
     }
 
     byte_array = g_byte_array_new ();
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 1d3d2b0c..c19ee8e2 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
 					notmuch_find_flags_t flags,
 					char **direntry);
 
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
+				     const char *mime_string);
+
 /* directory.cc */
 
 notmuch_directory_t *
diff --git a/test/T050-new.sh b/test/T050-new.sh
index cb67889c..6791f87c 100755
--- a/test/T050-new.sh
+++ b/test/T050-new.sh
@@ -455,12 +455,4 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
 EOF
 test_expect_equal_file EXPECTED OUTPUT
 
-add_email_corpus indexing
-
-test_begin_subtest "index text/* attachments"
-test_subtest_known_broken
-notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
-notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
-test_expect_equal_file_nonempty EXPECTED OUTPUT
-
 test_done
diff --git a/test/T760-as-text.sh b/test/T760-as-text.sh
new file mode 100755
index 00000000..744567f2
--- /dev/null
+++ b/test/T760-as-text.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+test_description='index attachments as text'
+. $(dirname "$0")/test-lib.sh || exit 1
+
+add_email_corpus indexing
+test_begin_subtest "empty as_text; skip text/x-diff"
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+notmuch config set index.as_text "^text/"
+add_email_corpus indexing
+
+test_begin_subtest "as_index is text/; find text/x-diff"
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex with empty as_text, skips text/x-diff"
+notmuch config set index.as_text
+notmuch reindex '*'
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "reindex with empty as_text; skips application/pdf"
+notmuch config set index.as_text
+notmuch reindex '*'
+messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
+count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "reindex with as_text as text/; finds text/x-diff"
+notmuch config set index.as_text "^text/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex with as_text as text/; skips application/pdf"
+notmuch config set index.as_text
+notmuch config set index.as_text "^text/"
+notmuch reindex '*'
+messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
+count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "as_text has multiple regexes"
+notmuch config set index.as_text "blahblah;^text/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "as_text is non-anchored regex"
+notmuch config set index.as_text "e.t/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "as_text is 'application/pdf'"
+notmuch config set index.as_text "^application/pdf$"
+notmuch reindex '*'
+notmuch search id:871qo9p4tf.fsf@tethera.net > EXPECTED
+notmuch search id:871qo9p4tf.fsf@tethera.net and '"not really PDF"' > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "as_text is bad regex"
+notmuch config set index.as_text '['
+notmuch reindex '*' >& OUTPUT
+cat<<EOF > EXPECTED
+Error in index.as_text: Invalid regular expression: [
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_done
diff --git a/test/corpora/indexing/fake-pdf:2,S b/test/corpora/indexing/fake-pdf:2,S
new file mode 100644
index 00000000..60a7a47f
--- /dev/null
+++ b/test/corpora/indexing/fake-pdf:2,S
@@ -0,0 +1,11 @@
+From: David Bremner <david@tethera.net>
+To: example@example.com
+Subject: attachment content type
+Date: Thu, 05 Jan 2023 08:02:36 -0400
+Message-ID: <871qo9p4tf.fsf@tethera.net>
+MIME-Version: 1.0
+Content-Type: application/pdf
+Content-Disposition: attachment; filename=fake.pdf
+Content-Transfer-Encoding: base64
+
+dGhpcyBpcyBub3QgcmVhbGx5IFBERgo=
\ No newline at end of file
-- 
2.39.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: v2 Index some attachements as text
  2023-01-06  0:02 v2 Index some attachements as text David Bremner
                   ` (2 preceding siblings ...)
  2023-01-06  0:02 ` [PATCH v2 3/3] lib: index attachments with mime types matching index.as_text David Bremner
@ 2023-04-02 22:37 ` David Bremner
  3 siblings, 0 replies; 5+ messages in thread
From: David Bremner @ 2023-04-02 22:37 UTC (permalink / raw)
  To: notmuch

David Bremner <david@tethera.net> writes:

> This obsoletes the series starting with
>
>      id:20220903232839.1473915-2-david@tethera.net
>
> Compared to that series this is rebased against master, it has some
> more tests (including both positive and negative tests) and it
> documents the non-anchoredness of the involved regex search.
>

I have applied this series to master.

d

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-04-02 22:37 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-06  0:02 v2 Index some attachements as text David Bremner
2023-01-06  0:02 ` [PATCH v2 1/3] lib: add config key INDEX_AS_TEXT David Bremner
2023-01-06  0:02 ` [PATCH v2 2/3] lib: parse index.as_text David Bremner
2023-01-06  0:02 ` [PATCH v2 3/3] lib: index attachments with mime types matching index.as_text David Bremner
2023-04-02 22:37 ` v2 Index some attachements as text David Bremner

Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).