From: David Bremner <david@tethera.net>
To: notmuch@notmuchmail.org
Subject: [PATCH v2 3/3] lib: index attachments with mime types matching index.as_text
Date: Thu, 5 Jan 2023 20:02:06 -0400 [thread overview]
Message-ID: <20230106000206.3595708-4-david@tethera.net> (raw)
In-Reply-To: <20230106000206.3595708-1-david@tethera.net>
Instead of skipping indexing all attachments, we check of a (user
configured) mime type that is indexable as text.
---
doc/man1/notmuch-config.rst | 10 ++++
lib/database.cc | 12 +++++
lib/index.cc | 25 ++++++++--
lib/notmuch-private.h | 4 ++
test/T050-new.sh | 8 ----
test/T760-as-text.sh | 77 ++++++++++++++++++++++++++++++
test/corpora/indexing/fake-pdf:2,S | 11 +++++
7 files changed, 136 insertions(+), 11 deletions(-)
create mode 100755 test/T760-as-text.sh
create mode 100644 test/corpora/indexing/fake-pdf:2,S
diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 388315f6..988dc5a3 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -122,6 +122,16 @@ paths are presumed relative to `$HOME` for items in section
Default tag prefix (filter) for :any:`notmuch-git`.
+.. nmconfig:: index.as_text
+
+ List of regular expressions (without delimiters) for MIME types to
+ be indexed as text. Currently this applies only to attachments. By
+ default the regex matches anywhere in the content type; if they
+ user wants an anchored match, they should include anchors in their
+ regexes.
+
+ History: This configuration value was introduced in notmuch 0.38.
+
.. nmconfig:: index.decrypt
Policy for decrypting encrypted messages during indexing. Must be
diff --git a/lib/database.cc b/lib/database.cc
index d1e5f1af..6987e2f4 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
{
return notmuch->status_string;
}
+
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
+{
+ for (size_t i = 0; i < notmuch->index_as_text_length; i++) {
+ if (regexec (¬much->index_as_text[i], mime_string, 0, NULL, 0) == 0) {
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/lib/index.cc b/lib/index.cc
index 728bfb22..629dcb22 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
GMimeObject *part,
_notmuch_message_crypto_t *msg_crypto);
+static bool
+_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
+{
+ GMimeContentType *content_type = g_mime_object_get_content_type (part);
+ notmuch_database_t *notmuch = notmuch_message_get_database (message);
+
+ if (content_type) {
+ char *mime_string = g_mime_content_type_get_mime_type (content_type);
+ if (mime_string) {
+ bool ret = _notmuch_database_indexable_as_text (notmuch, mime_string);
+ g_free (mime_string);
+ return ret;
+ }
+ }
+ return false;
+}
+
/* Callback to generate terms for each mime part of a message. */
static void
_index_mime_part (notmuch_message_t *message,
@@ -497,9 +514,11 @@ _index_mime_part (notmuch_message_t *message,
_notmuch_message_add_term (message, "tag", "attachment");
_notmuch_message_gen_terms (message, "attachment", filename);
- /* XXX: Would be nice to call out to something here to parse
- * the attachment into text and then index that. */
- goto DONE;
+ if (! _indexable_as_text (message, part)) {
+ /* XXX: Would be nice to call out to something here to parse
+ * the attachment into text and then index that. */
+ goto DONE;
+ }
}
byte_array = g_byte_array_new ();
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 1d3d2b0c..c19ee8e2 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
notmuch_find_flags_t flags,
char **direntry);
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
+ const char *mime_string);
+
/* directory.cc */
notmuch_directory_t *
diff --git a/test/T050-new.sh b/test/T050-new.sh
index cb67889c..6791f87c 100755
--- a/test/T050-new.sh
+++ b/test/T050-new.sh
@@ -455,12 +455,4 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
EOF
test_expect_equal_file EXPECTED OUTPUT
-add_email_corpus indexing
-
-test_begin_subtest "index text/* attachments"
-test_subtest_known_broken
-notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
-notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
-test_expect_equal_file_nonempty EXPECTED OUTPUT
-
test_done
diff --git a/test/T760-as-text.sh b/test/T760-as-text.sh
new file mode 100755
index 00000000..744567f2
--- /dev/null
+++ b/test/T760-as-text.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+test_description='index attachments as text'
+. $(dirname "$0")/test-lib.sh || exit 1
+
+add_email_corpus indexing
+test_begin_subtest "empty as_text; skip text/x-diff"
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+notmuch config set index.as_text "^text/"
+add_email_corpus indexing
+
+test_begin_subtest "as_index is text/; find text/x-diff"
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex with empty as_text, skips text/x-diff"
+notmuch config set index.as_text
+notmuch reindex '*'
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "reindex with empty as_text; skips application/pdf"
+notmuch config set index.as_text
+notmuch reindex '*'
+gmessages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
+count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "reindex with as_text as text/; finds text/x-diff"
+notmuch config set index.as_text "^text/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex with as_text as text/; skips application/pdf"
+notmuch config set index.as_text "^text/"
+notmuch config set index.as_text
+notmuch reindex '*'
+messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
+count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "as_text has multiple regexes"
+notmuch config set index.as_text "blahblah;^text/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "as_text is non-anchored regex"
+notmuch config set index.as_text "e.t/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "as_text is 'application/pdf'"
+notmuch config set index.as_text "^application/pdf$"
+notmuch reindex '*'
+notmuch search id:871qo9p4tf.fsf@tethera.net > EXPECTED
+notmuch search id:871qo9p4tf.fsf@tethera.net and '"not really PDF"' > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "as_text is bad regex"
+notmuch config set index.as_text '['
+notmuch reindex '*' >& OUTPUT
+cat<<EOF > EXPECTED
+Error in index.as_text: Invalid regular expression: [
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_done
diff --git a/test/corpora/indexing/fake-pdf:2,S b/test/corpora/indexing/fake-pdf:2,S
new file mode 100644
index 00000000..60a7a47f
--- /dev/null
+++ b/test/corpora/indexing/fake-pdf:2,S
@@ -0,0 +1,11 @@
+From: David Bremner <david@tethera.net>
+To: example@example.com
+Subject: attachment content type
+Date: Thu, 05 Jan 2023 08:02:36 -0400
+Message-ID: <871qo9p4tf.fsf@tethera.net>
+MIME-Version: 1.0
+Content-Type: application/pdf
+Content-Disposition: attachment; filename=fake.pdf
+Content-Transfer-Encoding: base64
+
+dGhpcyBpcyBub3QgcmVhbGx5IFBERgo=
\ No newline at end of file
--
2.39.0
next prev parent reply other threads:[~2023-01-06 0:02 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-01-06 0:02 v2 Index some attachements as text David Bremner
2023-01-06 0:02 ` [PATCH v2 1/3] lib: add config key INDEX_AS_TEXT David Bremner
2023-01-06 0:02 ` [PATCH v2 2/3] lib: parse index.as_text David Bremner
2023-01-06 0:02 ` David Bremner [this message]
2023-04-02 22:37 ` v2 Index some attachements as text David Bremner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://notmuchmail.org/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230106000206.3595708-4-david@tethera.net \
--to=david@tethera.net \
--cc=notmuch@notmuchmail.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://yhetil.org/notmuch.git/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).