From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by arlo.cworth.org (Postfix) with ESMTP id 02C226DE01BE for ; Thu, 9 Jun 2016 19:28:35 -0700 (PDT) X-Virus-Scanned: Debian amavisd-new at cworth.org X-Spam-Flag: NO X-Spam-Score: -0.011 X-Spam-Level: X-Spam-Status: No, score=-0.011 tagged_above=-999 required=5 tests=[AWL=-0.000, SPF_PASS=-0.001, T_RP_MATCHES_RCVD=-0.01] autolearn=disabled Received: from arlo.cworth.org ([127.0.0.1]) by localhost (arlo.cworth.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id BskbBxEtfBwE for ; Thu, 9 Jun 2016 19:28:26 -0700 (PDT) Received: from fethera.tethera.net (fethera.tethera.net [198.245.60.197]) by arlo.cworth.org (Postfix) with ESMTPS id 1EE566DE0130 for ; Thu, 9 Jun 2016 19:28:26 -0700 (PDT) Received: from remotemail by fethera.tethera.net with local (Exim 4.84) (envelope-from ) id 1bBCAz-00032U-41; Thu, 09 Jun 2016 22:28:01 -0400 Received: (nullmailer pid 30971 invoked by uid 1000); Fri, 10 Jun 2016 02:28:12 -0000 From: David Bremner To: David Bremner , Austin Clements Cc: sfischme@uwaterloo.ca, Gaute Hope , notmuch Subject: [PATCH] WIP: regexp matching in 'subject' and 'from' Date: Thu, 9 Jun 2016 23:28:08 -0300 Message-Id: <1465525688-30913-1-git-send-email-david@tethera.net> X-Mailer: git-send-email 2.8.1 In-Reply-To: <1465265149-7174-1-git-send-email-david@tethera.net> References: <1465265149-7174-1-git-send-email-david@tethera.net> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: "Use and development of the notmuch mail system." 
List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 10 Jun 2016 02:28:35 -0000 The idea is that you can run % notmuch search subject_re:<regexp> % notmuch search from_re:<regexp> or % notmuch search subject:"your usual phrase search" % notmuch search from:"usual phrase search" This should also work with bindings, since it extends the query parser. This is trivial to extend for other value slots, but currently the only value slots are date, message_id, from, subject, and last_mod. Date is already searchable, and message_id is not obviously useful to regex match. --- This is more or less complete codewise; it fixes the known problems with the last version. Names of prefixes are debatable, and of course it needs doc and tests. I don't see any reason not to do this at the moment, since it's basically free; no new terms are added to the database. lib/Makefile.local | 1 + lib/database-private.h | 2 + lib/database.cc | 12 +++++- lib/regexp-fields.cc | 102 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/regexp-fields.h | 78 +++++++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 lib/regexp-fields.cc create mode 100644 lib/regexp-fields.h diff --git a/lib/Makefile.local b/lib/Makefile.local index beb9635..68771e6 100644 --- a/lib/Makefile.local +++ b/lib/Makefile.local @@ -51,6 +51,7 @@ libnotmuch_cxx_srcs = \ $(dir)/query.cc \ $(dir)/query-fp.cc \ $(dir)/config.cc \ + $(dir)/regexp-fields.cc \ $(dir)/thread.cc libnotmuch_modules := $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o) diff --git a/lib/database-private.h b/lib/database-private.h index ca71a92..090fcdf 100644 --- a/lib/database-private.h +++ b/lib/database-private.h @@ -186,6 +186,8 @@ struct _notmuch_database { #if HAVE_XAPIAN_FIELD_PROCESSOR Xapian::FieldProcessor *date_field_processor; Xapian::FieldProcessor *query_field_processor; + Xapian::FieldProcessor *subject_re_field_processor; + Xapian::FieldProcessor 
*from_re_field_processor; #endif Xapian::ValueRangeProcessor *last_mod_range_processor; }; diff --git a/lib/database.cc b/lib/database.cc index 86bf261..4049406 100644 --- a/lib/database.cc +++ b/lib/database.cc @@ -21,6 +21,7 @@ #include "database-private.h" #include "parse-time-vrp.h" #include "query-fp.h" +#include "regexp-fields.h" #include "string-util.h" #include @@ -1008,6 +1009,10 @@ notmuch_database_open_verbose (const char *path, notmuch->query_parser->add_boolean_prefix("date", notmuch->date_field_processor); notmuch->query_field_processor = new QueryFieldProcessor (*notmuch->query_parser, notmuch); notmuch->query_parser->add_boolean_prefix("query", notmuch->query_field_processor); + notmuch->subject_re_field_processor = new RegexpFieldProcessor (NOTMUCH_VALUE_SUBJECT, *notmuch->query_parser, notmuch); + notmuch->query_parser->add_boolean_prefix("subject_re", notmuch->subject_re_field_processor); + notmuch->from_re_field_processor = new RegexpFieldProcessor (NOTMUCH_VALUE_FROM, *notmuch->query_parser, notmuch); + notmuch->query_parser->add_boolean_prefix("from_re", notmuch->from_re_field_processor); #endif notmuch->last_mod_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_LAST_MOD, "lastmod:"); @@ -1098,7 +1103,12 @@ notmuch_database_close (notmuch_database_t *notmuch) notmuch->date_range_processor = NULL; delete notmuch->last_mod_range_processor; notmuch->last_mod_range_processor = NULL; - +#ifdef HAVE_XAPIAN_FIELD_PROCESSOR + delete notmuch->from_re_field_processor; + notmuch->from_re_field_processor = NULL; + delete notmuch->subject_re_field_processor; + notmuch->subject_re_field_processor = NULL; +#endif return status; } diff --git a/lib/regexp-fields.cc b/lib/regexp-fields.cc new file mode 100644 index 0000000..4bbebda --- /dev/null +++ b/lib/regexp-fields.cc @@ -0,0 +1,102 @@ +/* regexp-fields.cc - field processor glue for regexp searching + * + * This file is part of notmuch. 
+ * + * Copyright © 2015 Austin Clements + * Copyright © 2016 David Bremner + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see https://www.gnu.org/licenses/ . + * + * Author: Austin Clements + * David Bremner + */ + +#include "regexp-fields.h" + +#ifdef HAVE_XAPIAN_FIELD_PROCESSOR +RegexpPostingSource::RegexpPostingSource (Xapian::valueno slot, const std::string &regexp) + : slot_ (slot) +{ + int r = regcomp (&regexp_, regexp.c_str (), REG_EXTENDED | REG_NOSUB); + + if (r != 0) + /* XXX Report a query syntax error using regerror */ + throw "regcomp failed"; +} + +RegexpPostingSource::~RegexpPostingSource () +{ + regfree (&regexp_); +} + +void +RegexpPostingSource::init (const Xapian::Database &db) +{ + db_ = db; + it_ = db_.valuestream_begin (slot_); + end_ = db.valuestream_end (slot_); + started_ = false; +} + +Xapian::doccount +RegexpPostingSource::get_termfreq_min () const +{ + return 0; +} + +Xapian::doccount +RegexpPostingSource::get_termfreq_est () const +{ + return get_termfreq_max () / 2; +} + +Xapian::doccount +RegexpPostingSource::get_termfreq_max () const +{ + return db_.get_value_freq (slot_); +} + +Xapian::docid +RegexpPostingSource::get_docid () const +{ + return it_.get_docid (); +} + +bool +RegexpPostingSource::at_end () const +{ + return it_ == end_; +} + +void +RegexpPostingSource::next (unused (double min_wt)) +{ + if (started_ && ! at_end ()) + ++it_; + started_ = true; + + for (; ! 
at_end (); ++it_) { + std::string value = *it_; + if (regexec (&regexp_, value.c_str (), 0, NULL, 0) == 0) + break; + } +} + +Xapian::Query +RegexpFieldProcessor::operator() (const std::string & str) +{ + postings = new RegexpPostingSource (slot, str); + return Xapian::Query (postings); +} +#endif diff --git a/lib/regexp-fields.h b/lib/regexp-fields.h new file mode 100644 index 0000000..a184cab --- /dev/null +++ b/lib/regexp-fields.h @@ -0,0 +1,78 @@ +/* regexp-fields.h - xapian glue for semi-bruteforce regexp search + * + * This file is part of notmuch. + * + * Copyright © 2015 Austin Clements + * Copyright © 2016 David Bremner + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see https://www.gnu.org/licenses/ . + * + * Author: Austin Clements + * David Bremner + */ + +#ifndef NOTMUCH_REGEXP_FIELDS_H +#define NOTMUCH_REGEXP_FIELDS_H +#ifdef HAVE_XAPIAN_FIELD_PROCESSOR +#include +#include +#include +#include "notmuch-private.h" + +/* A posting source that returns documents where a value matches a + * regexp. 
+ */ +class RegexpPostingSource : public Xapian::PostingSource +{ + protected: + const Xapian::valueno slot_; + regex_t regexp_; + Xapian::Database db_; + bool started_; + Xapian::ValueIterator it_, end_; + +/* No copying */ + RegexpPostingSource (const RegexpPostingSource &); + RegexpPostingSource &operator= (const RegexpPostingSource &); + + public: + RegexpPostingSource (Xapian::valueno slot, const std::string &regexp); + ~RegexpPostingSource (); + void init (const Xapian::Database &db); + Xapian::doccount get_termfreq_min () const; + Xapian::doccount get_termfreq_est () const; + Xapian::doccount get_termfreq_max () const; + Xapian::docid get_docid () const; + bool at_end () const; + void next (unused (double min_wt)); +}; + + +class RegexpFieldProcessor : public Xapian::FieldProcessor { + protected: + Xapian::valueno slot; + Xapian::QueryParser &parser; + notmuch_database_t *notmuch; + RegexpPostingSource *postings = NULL; + + public: + RegexpFieldProcessor (Xapian::valueno slot_, Xapian::QueryParser &parser_, notmuch_database_t *notmuch_) + : slot(slot_), parser(parser_), notmuch(notmuch_) { }; + + ~RegexpFieldProcessor () { delete postings; }; + + Xapian::Query operator()(const std::string & str); +}; +#endif +#endif /* NOTMUCH_REGEXP_FIELDS_H */ -- 2.8.1