unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH] switch to gzipped mboxes
@ 2015-08-21 23:31 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2015-08-21 23:31 UTC (permalink / raw)
  To: meta

Mboxes may be huge, so only support downloading gzipped mboxes
to save bandwidth and to get free checksumming.

Streaming output means we should not be wasting too much memory
on this unless the chosen server sucks.
---
 lib/PublicInbox/Mbox.pm | 64 ++++++++++++++++++++++++++++++++++++++++++++++---
 lib/PublicInbox/WWW.pm  |  8 +++++--
 t/cgi.t                 | 11 ++++++---
 3 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index fc9df1a..cb9d65d 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -11,8 +11,7 @@ sub thread_mbox {
 	my ($ctx, $srch) = @_;
 	sub {
 		my ($response) = @_; # Plack callback
-		my $w = $response->([200, ['Content-Type' => 'text/plain']]);
-		emit_mbox($w, $ctx, $srch);
+		emit_mbox($response, $ctx, $srch);
 	}
 }
 
@@ -40,7 +39,14 @@ sub emit_msg {
 }
 
 sub emit_mbox {
-	my ($fh, $ctx, $srch) = @_;
+	my ($response, $ctx, $srch) = @_;
+	eval { require IO::Compress::Gzip };
+	return need_gzip($response) if $@;
+
+	# http://www.iana.org/assignments/media-types/application/gzip
+	# http://www.iana.org/assignments/media-types/application/mbox
+	my $fh = $response->([200, ['Content-Type' => 'application/gzip']]);
+	$fh = PublicInbox::MboxGz->new($fh);
 
 	require PublicInbox::GitCatFile;
 	require Email::Simple;
@@ -62,6 +68,58 @@ sub emit_mbox {
 
 		$opts{offset} += $nr;
 	} while ($nr > 0);
+
+	$fh->close;
+}
+
+sub need_gzip {
+	my $fh = $_[0]->([501, ['Content-Type' => 'text/html']]);
+	my $title = 'gzipped mbox not available';
+	$fh->write(<<EOF);
+<html><head><title>$title</title><body><pre>$title
+The administrator needs to install the IO::Compress::Gzip Perl module
+to support gzipped mboxes.
+<a href="../">Return to index</a></pre></body></html>
+EOF
+}
+
+1;
+
+# fh may not be a proper IO, so we wrap the write and close methods
+# to prevent IO::Compress::Gzip from complaining
+package PublicInbox::MboxGz;
+use strict;
+use warnings;
+use fields qw(gz fh buf);
+
+sub new {
+	my ($class, $fh) = @_;
+	my $self = fields::new($class);
+	my $buf;
+	$self->{buf} = \$buf;
+	$self->{gz} = IO::Compress::Gzip->new(\$buf);
+	$self->{fh} = $fh;
+	$self;
+}
+
+sub _flush_buf {
+	my ($self) = @_;
+	if (defined ${$self->{buf}}) {
+		$self->{fh}->write(${$self->{buf}});
+		${$self->{buf}} = undef;
+	}
+}
+
+sub write {
+	$_[0]->{gz}->write($_[1]);
+	_flush_buf($_[0]);
+}
+
+sub close {
+	my ($self) = @_;
+	$self->{gz}->close;
+	_flush_buf($self);
+	# do not actually close $fh
 }
 
 1;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index cd8a570..68839d7 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -53,7 +53,8 @@ sub run {
 	} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) {
 		invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx, $cgi);
 
-	} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox\z!o) {
+	} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox\.gz!o) {
+		my $sfx = $3;
 		invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $cgi);
 
 	} elsif ($path_info =~ m!$LISTNAME_RE/f/\S+\.txt\z!o) {
@@ -329,7 +330,10 @@ sub msg_pfx {
 	"../f/$href.html";
 }
 
-# /$LISTNAME/t/$MESSAGE_ID.mbox                    -> search results as mbox
+# /$LISTNAME/t/$MESSAGE_ID.mbox.gz        -> search results as gzipped mbox
+# note: I'm not a big fan of other compression formats since they're
+# significantly more expensive on CPU than gzip and less-widely available,
+# especially on older systems.  Stick to zlib since that's what git uses.
 sub get_thread_mbox {
 	my ($ctx, $cgi) = @_;
 	my $srch = searcher($ctx) or return need_search($ctx);
diff --git a/t/cgi.t b/t/cgi.t
index 2747a15..e87f7dc 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -183,15 +183,20 @@ EOF
 {
 	local $ENV{HOME} = $home;
 	local $ENV{PATH} = $main_path;
-	my $path = "/test/t/blahblah%40example.com.mbox";
+	my $path = "/test/t/blahblah%40example.com.mbox.gz";
 	my $res = cgi_run($path);
 	like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
 	my $indexed = system($index, $maindir) == 0;
 	if ($indexed) {
 		$res = cgi_run($path);
-		# use Data::Dumper; print STDERR Dumper($res);
 		like($res->{head}, qr/^Status: 200 /, "search returned mbox");
-		like($res->{body}, qr/^From /m, "From lines in mbox");
+		eval {
+			require IO::Uncompress::Gunzip;
+			my $in = $res->{body};
+			my $out;
+			IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+			like($out, qr/^From /m, "From lines in mbox");
+		};
 	} else {
 		like($res->{head}, qr/^Status: 501 /, "search not available");
 	}
-- 
EW


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2015-08-21 23:31 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-08-21 23:31 [PATCH] switch to gzipped mboxes Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).