unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 1/3] sqlite: avoid incorrect/deprecated `LIKE' use
Date: Wed,  4 Dec 2024 19:39:12 +0000	[thread overview]
Message-ID: <20241204193914.3227868-2-e@80x24.org> (raw)
In-Reply-To: <20241204193914.3227868-1-e@80x24.org>

The `case_sensitive_like' pragma is deprecated since
SQLite 3.44+.  Furthermore, our use of `LIKE' in SharedKV->keys
seemed to be broken anyways since `LIKE' doesn't seem to
work with binary data (stored with SQL_BLOB), but neither does
`GLOB'.

So avoid `LIKE' entirely.  For non-SQL_BLOB data we'll favor the
always-case-sensitive GLOB.  For SQL_BLOB data, we must rely on
the Perl regexp engine from what I can tell.  `GLOB' is
preferred where possible since SQLite will be able to use
indices in some cases whereas `REGEXP' cannot.

Fixing SharedKV->keys should improve bash completion for lei.

Some common SQLite-related utilities are now in a
PublicInbox::SQLiteUtil package which will be expanded to deal
with more commonalities between SQLite users in our tree.
---
 MANIFEST                        |  2 ++
 lib/PublicInbox/ExtSearchIdx.pm | 12 ++++----
 lib/PublicInbox/LeiMailSync.pm  | 14 +++-------
 lib/PublicInbox/SQLiteUtil.pm   | 30 ++++++++++++++++++++
 lib/PublicInbox/SharedKV.pm     | 23 ++++++----------
 t/shared_kv.t                   |  5 ++++
 t/sqlite_util.t                 | 49 +++++++++++++++++++++++++++++++++
 7 files changed, 105 insertions(+), 30 deletions(-)
 create mode 100644 lib/PublicInbox/SQLiteUtil.pm
 create mode 100644 t/sqlite_util.t

diff --git a/MANIFEST b/MANIFEST
index 1305ed8a..b0b4f71c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -334,6 +334,7 @@ lib/PublicInbox/RepoList.pm
 lib/PublicInbox/RepoSnapshot.pm
 lib/PublicInbox/RepoTree.pm
 lib/PublicInbox/SHA.pm
+lib/PublicInbox/SQLiteUtil.pm
 lib/PublicInbox/SaPlugin/ListMirror.pm
 lib/PublicInbox/SaPlugin/ListMirror.pod
 lib/PublicInbox/Search.pm
@@ -614,6 +615,7 @@ t/solve/bare.patch
 t/solver_git.t
 t/spamcheck_spamc.t
 t/spawn.t
+t/sqlite_util.t
 t/syscall.t
 t/tail_notify.t
 t/thread-cycle.t
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index cead0f8a..d8db7d4b 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -21,6 +21,7 @@ use Carp qw(croak carp);
 use Scalar::Util qw(blessed);
 use Sys::Hostname qw(hostname);
 use File::Glob qw(bsd_glob GLOB_NOSORT);
+use PublicInbox::SQLiteUtil;
 use PublicInbox::Isearch;
 use PublicInbox::MultiGit;
 use PublicInbox::Spawn ();
@@ -461,15 +462,14 @@ EOM
 DELETE FROM inboxes WHERE ibx_id = ?
 
 		# drop last_commit info
-		my $pat = $eidx_key;
-		$pat =~ s/([_%\\])/\\$1/g;
-		$self->{oidx}->dbh->do('PRAGMA case_sensitive_like = ON');
+		# We use GLOB in addition to REGEXP since GLOB can use indices
 		my $lc_i = $self->{oidx}->dbh->prepare(<<'');
-SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ?
+SELECT key FROM eidx_meta WHERE key GLOB ? AND key REGEXP ?
 
-		$lc_i->execute("lc-%:$pat//%", '\\');
+		my $ekg = 'lc-v[1-9]*:'.
+			PublicInbox::SQLiteUtil::escape_glob($eidx_key).'//*';
+		$lc_i->execute($ekg, qr!\Alc-v[1-9]+:\Q$eidx_key\E//!);
 		while (my ($key) = $lc_i->fetchrow_array) {
-			next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!;
 			warn "# removing $key\n";
 			$self->{oidx}->dbh->do(<<'', undef, $key);
 DELETE FROM eidx_meta WHERE key = ?
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index d0f6d7b4..d23f6b4b 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -12,6 +12,7 @@ use PublicInbox::ContentHash qw(git_sha);
 use Carp ();
 use PublicInbox::Git qw(%HEXLEN2SHA);
 use PublicInbox::IO qw(read_all);
+use PublicInbox::SQLiteUtil;
 
 sub dbh_new {
 	my ($self) = @_;
@@ -31,7 +32,6 @@ sub dbh_new {
 	# no sqlite_unicode, here, all strings are binary
 	create_tables($self, $dbh);
 	$dbh->do('PRAGMA journal_mode = WAL') if $creat;
-	$dbh->do('PRAGMA case_sensitive_like = ON');
 	$dbh;
 }
 
@@ -421,18 +421,12 @@ sub locations_for {
 
 # returns a list of folders used for completion
 sub folders {
-	my ($self, @pfx) = @_;
+	my ($self, $pfx, $anywhere) = @_;
 	my $sql = 'SELECT loc FROM folders';
 	my $re;
-	if (defined($pfx[0])) {
+	if (defined $pfx) {
 		$sql .= ' WHERE loc REGEXP ?'; # DBD::SQLite uses perlre
-		if (ref($pfx[0])) { # assume qr// "Regexp"
-			$re = $pfx[0];
-		} else {
-			$re = !!$pfx[1] ? '.*' : '';
-			$re .= quotemeta($pfx[0]);
-			$re .= '.*';
-		}
+		$re = PublicInbox::SQLiteUtil::mk_sqlite_re $pfx, $anywhere;
 	}
 	my $sth = ($self->{dbh} //= dbh_new($self))->prepare($sql);
 	$sth->bind_param(1, $re) if defined($re);
diff --git a/lib/PublicInbox/SQLiteUtil.pm b/lib/PublicInbox/SQLiteUtil.pm
new file mode 100644
index 00000000..68e0726d
--- /dev/null
+++ b/lib/PublicInbox/SQLiteUtil.pm
@@ -0,0 +1,30 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# common bits for SQLite users in our codebase
+package PublicInbox::SQLiteUtil;
+use v5.12;
+
+my %SQLITE_GLOB_MAP = (
+	'[' => '[[]',
+	']' => '[]]',
+	'*' => '[*]',
+	'?' => '[?]'
+);
+
+# n.b. GLOB doesn't seem to work on data inserted w/ SQL_BLOB
+sub escape_glob ($) {
+	my ($s) = @_;
+	$s =~ s/([\[\]\*\?])/$SQLITE_GLOB_MAP{$1}/sge;
+	$s;
+}
+
+# DBD::SQLite maps REGEXP to use perlre, and that works on SQL_BLOB
+# whereas GLOB and LIKE don't seem to...
+sub mk_sqlite_re ($$) {
+	my ($pfx, $anywhere) = @_;
+	ref($pfx) ? $pfx # assume qr// Regexp
+		: ($anywhere ? '.*' : '^')."\Q$pfx\E.*";
+}
+
+1;
diff --git a/lib/PublicInbox/SharedKV.pm b/lib/PublicInbox/SharedKV.pm
index 89ab3f74..51ece48d 100644
--- a/lib/PublicInbox/SharedKV.pm
+++ b/lib/PublicInbox/SharedKV.pm
@@ -12,6 +12,7 @@ use File::Temp qw(tempdir);
 use DBI qw(:sql_types); # SQL_BLOB
 use PublicInbox::Spawn;
 use File::Path qw(rmtree);
+use PublicInbox::SQLiteUtil;
 
 sub dbh {
 	my ($self, $lock) = @_;
@@ -79,23 +80,17 @@ SELECT k,v FROM kv
 }
 
 sub keys {
-	my ($self, @pfx) = @_;
+	my ($self, $pfx, $anywhere) = @_;
+	# n.b. can't use GLOB for index optimization due to SQL_BLOB,
+	# so regexps it is.
 	my $sql = 'SELECT k FROM kv';
-	if (defined $pfx[0]) {
-		$sql .= ' WHERE k LIKE ? ESCAPE ?';
-		my $anywhere = !!$pfx[1];
-		$pfx[1] = '\\';
-		$pfx[0] =~ s/([%_\\])/\\$1/g; # glob chars
-		$pfx[0] .= '%';
-		substr($pfx[0], 0, 0, '%') if $anywhere;
-	} else {
-		@pfx = (); # [0] may've been undef
+	my $re;
+	if (defined $pfx) {
+		$sql .= ' WHERE k REGEXP ?'; # DBD::SQLite uses perlre
+		$re = PublicInbox::SQLiteUtil::mk_sqlite_re $pfx, $anywhere;
 	}
 	my $sth = $self->dbh->prepare($sql);
-	if (@pfx) {
-		$sth->bind_param(1, $pfx[0], SQL_BLOB);
-		$sth->bind_param(2, $pfx[1]);
-	}
+	$sth->bind_param(1, $re) if defined $re;
 	$sth->execute;
 	map { $_->[0] } @{$sth->fetchall_arrayref};
 }
diff --git a/t/shared_kv.t b/t/shared_kv.t
index 8dfd3b25..aa1ce4e3 100644
--- a/t/shared_kv.t
+++ b/t/shared_kv.t
@@ -46,4 +46,9 @@ $skv->dbh;
 ok($skv->set_maybe('02', '2'), "`02' set");
 ok($skv->set_maybe('2', '2'), "`2' set (no match on `02')");
 
+my @k = $skv->keys('2');
+is_deeply \@k, [ '2' ], 'prefix match on ->keys';
+@k = sort $skv->keys('2', 1);
+is_deeply \@k, [ '02', '2' ], 'anywhere match on ->keys';
+
 done_testing;
diff --git a/t/sqlite_util.t b/t/sqlite_util.t
new file mode 100644
index 00000000..f41285e0
--- /dev/null
+++ b/t/sqlite_util.t
@@ -0,0 +1,49 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use PublicInbox::TestCommon;
+require_mods 'DBD::SQLite';
+use_ok 'PublicInbox::SQLiteUtil';
+require DBI;
+DBI->import(':sql_types');
+
+my $dbh = DBI->connect('dbi:SQLite:dbname=:memory:', '', '', {
+	AutoCommit => 1,
+	RaiseError => 1,
+	PrintError => 0,
+	sqlite_use_immediate_transaction => 1,
+});
+
+$dbh->do('CREATE TABLE test (key BLOB NOT NULL, UNIQUE (key))');
+
+my $ins = $dbh->prepare('INSERT INTO test (key) VALUES (?)');
+my $sel = $dbh->prepare('SELECT key FROM test WHERE key GLOB ?');
+my $non_utf8 = "h\x{e5}llo[wor]ld!";
+my $us_ascii = 'h*llo[wor]ld?';
+
+$dbh->begin_work;
+my @SQL_BLOB = (SQL_BLOB());
+@SQL_BLOB = (); # FIXME: can't get GLOB to work w/ SQL_BLOB
+for my $k ($us_ascii, $non_utf8) {
+	$ins->bind_param(1, $k, @SQL_BLOB);
+	$ins->execute;
+}
+$dbh->commit;
+
+$sel->bind_param(1, '*', @SQL_BLOB);
+$sel->execute;
+my $rows = $sel->fetchall_arrayref;
+is scalar(@$rows), 2, q[`*' got everything];
+
+$sel->bind_param(1, PublicInbox::SQLiteUtil::escape_glob($us_ascii), @SQL_BLOB);
+$sel->execute;
+$rows = $sel->fetchall_arrayref;
+is_deeply $rows, [ [ $us_ascii ] ], 'US-ASCII exact match';
+
+$sel->bind_param(1, PublicInbox::SQLiteUtil::escape_glob($non_utf8), @SQL_BLOB);
+$sel->execute;
+$rows = $sel->fetchall_arrayref;
+is_deeply $rows, [ [ $non_utf8 ] ], 'ISO-8859-1 exact match';
+
+done_testing;

  reply	other threads:[~2024-12-04 19:39 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-04 19:39 [PATCH 0/3] some SQLite-related things Eric Wong
2024-12-04 19:39 ` Eric Wong [this message]
2024-12-04 19:39 ` [PATCH 2/3] sqlite: use `BLOB' column type instead of `VARBINARY' Eric Wong
2024-12-04 19:39 ` [PATCH 3/3] SQLiteUtil: hoist out common create_db code Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241204193914.3227868-2-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).