From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 81826209B1 for ; Wed, 5 Oct 2016 23:57:25 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 02/17] thread: remove Mail::Thread dependency Date: Wed, 5 Oct 2016 23:57:07 +0000 Message-Id: <20161005235722.14857-3-e@80x24.org> In-Reply-To: <20161005235722.14857-1-e@80x24.org> References: <20161005235722.14857-1-e@80x24.org> List-Id: Introduce our own SearchThread class for threading messages. This should allow us to specialize and optimize away objects in future commits. --- INSTALL | 1 - MANIFEST | 2 +- Makefile.PL | 1 - lib/PublicInbox/SearchIdx.pm | 4 +- lib/PublicInbox/SearchThread.pm | 323 ++++++++++++++++++++++++++++++++++++++++ lib/PublicInbox/SearchView.pm | 4 +- lib/PublicInbox/Thread.pm | 86 ----------- lib/PublicInbox/View.pm | 4 +- lib/PublicInbox/WWW.pm | 2 +- t/plack.t | 3 +- 10 files changed, 332 insertions(+), 98 deletions(-) create mode 100644 lib/PublicInbox/SearchThread.pm delete mode 100644 lib/PublicInbox/Thread.pm diff --git a/INSTALL b/INSTALL index 5851892..3a2f840 100644 --- a/INSTALL +++ b/INSTALL @@ -37,7 +37,6 @@ Optional components: Optional Perl modules: - Plack[1] libplack-perl - - Mail::Thread (2.5+)[1] libmail-thread-perl - URI::Escape[1] liburi-perl - Search::Xapian[2][3] libsearch-xapian-perl - IO::Compress::Gzip[3] perl-modules (or libio-compress-perl) diff --git a/MANIFEST b/MANIFEST index c39fa26..bcc4121 100644 --- a/MANIFEST +++ b/MANIFEST @@ -78,11 +78,11 @@ lib/PublicInbox/SaPlugin/ListMirror.pm lib/PublicInbox/Search.pm lib/PublicInbox/SearchIdx.pm lib/PublicInbox/SearchMsg.pm +lib/PublicInbox/SearchThread.pm lib/PublicInbox/SearchView.pm lib/PublicInbox/Spamcheck/Spamc.pm lib/PublicInbox/Spawn.pm lib/PublicInbox/SpawnPP.pm -lib/PublicInbox/Thread.pm lib/PublicInbox/Unsubscribe.pm lib/PublicInbox/View.pm lib/PublicInbox/WWW.pm diff --git a/Makefile.PL b/Makefile.PL index 4a91103..0bac7c9 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -22,7 +22,6 @@ WriteMakefile( 'Email::MIME::ContentType' => 0, 'Email::Simple' => 0, 'Encode::MIME::Header' => 0, - 'Mail::Thread' => '2.5', # 2.5+ needed for Email::Simple compat 'Plack' => 0, 'URI::Escape' => 0, # We have more test dependencies, but do not force diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 23aef9f..4aac028 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,8 +4,8 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm new file mode 100644 index 0000000..41fe859 --- /dev/null +++ b/lib/PublicInbox/SearchThread.pm @@ -0,0 +1,323 @@ +# This library is free software; you can redistribute it and/or modify +# it under the same terms as Perl itself. +# +# This license differs from the rest of public-inbox +# +# Our own jwz-style threading class based on Mail::Thread from CPAN. +# Mail::Thread is unmaintained and available on some distros. +# We also do not want pruning or subject grouping, since we want +# to encourage strict threading and hopefully encourage people +# to use proper In-Reply-To. +# +# This includes fixes from several open bugs for Mail::Thread +# +# Avoid circular references +# - https://rt.cpan.org/Public/Bug/Display.html?id=22817 +# +# And avoid recursion in recurse_down: +# - https://rt.cpan.org/Ticket/Display.html?id=116727 +# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479 +package PublicInbox::SearchThread; +use strict; +use warnings; +use Email::Abstract; + +sub new { + return bless { + messages => $_[1], + id_table => {}, + rootset => [] + }, $_[0]; +} + +sub _get_hdr { + my ($class, $msg, $hdr) = @_; + Email::Abstract->get_header($msg, $hdr) || ''; +} + +sub _uniq { + my %seen; + return grep { !$seen{$_}++ } @_; +} + +sub _references { + my $class = shift; + my $msg = shift; + my @references = ($class->_get_hdr($msg, "References") =~ /<([^>]+)>/g); + my $foo = $class->_get_hdr($msg, "In-Reply-To"); + chomp $foo; + $foo =~ s/.*?<([^>]+)>.*/$1/; + push @references, $foo + if $foo =~ /^\S+\@\S+$/ && (!@references || $references[-1] ne $foo); + return _uniq(@references); +} + +sub _msgid { + my ($class, $msg) = @_; + my $id = $class->_get_hdr($msg, "Message-ID"); + die "attempt to thread message with no id" unless $id; + chomp $id; + $id =~ s/^<([^>]+)>.*/$1/; # We expect this not to have <>s + return $id; +} + +sub rootset { @{$_[0]{rootset}} } + +sub thread { + my $self = shift; + $self->_setup(); + $self->{rootset} = [ grep { !$_->parent } values %{$self->{id_table}} ]; + $self->_finish(); +} + +sub _finish { + my $self = shift; + delete $self->{id_table}; + delete $self->{seen}; +} + +sub _get_cont_for_id { + my $self = shift; + my $id = shift; + $self->{id_table}{$id} ||= $self->_container_class->new($id); +} + +sub _container_class { 'PublicInbox::SearchThread::Container' } + +sub _setup { + my ($self) = @_; + + _add_message($self, $_) foreach @{$self->{messages}}; +} + +sub _add_message ($$) { + my ($self, $message) = @_; + + # A. if id_table... + my $this_container = $self->_get_cont_for_id($self->_msgid($message)); + $this_container->message($message); + + # B. For each element in the message's References field: + my @refs = $self->_references($message); + + my $prev; + for my $ref (@refs) { + # Find a Container object for the given Message-ID + my $container = $self->_get_cont_for_id($ref); + + # Link the References field's Containers together in the + # order implied by the References header + # * If they are already linked don't change the existing links + # * Do not add a link if adding that link would introduce + # a loop... + + if ($prev && + !$container->parent && # already linked + !$container->has_descendent($prev) # would loop + ) { + $prev->add_child($container); + } + $prev = $container; + } + + # C. Set the parent of this message to be the last element in + # References... + if ($prev && + !$this_container->has_descendent($prev) # would loop + ) { + $prev->add_child($this_container) + } +} + +sub order { + my $self = shift; + my $ordersub = shift; + + # make a fake root + my $root = $self->_container_class->new( 'fakeroot' ); + $root->add_child( $_ ) for @{ $self->{rootset} }; + + # sort it + $root->order_children( $ordersub ); + + # and untangle it + my @kids = $root->children; + $self->{rootset} = \@kids; + $root->remove_child($_) for @kids; +} + +package PublicInbox::SearchThread::Container; +use Carp qw(croak); +use Scalar::Util qw(weaken); + +sub new { my $self = shift; bless { id => shift }, $self; } + +sub message { $_[0]->{message} = $_[1] if @_ == 2; $_[0]->{message} } +sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} } +sub child { $_[0]->{child} = $_[1] if @_ == 2; $_[0]->{child} } +sub next { $_[0]->{next} = $_[1] if @_ == 2; $_[0]->{next} } +sub messageid { $_[0]->{id} } + +sub add_child { + my ($self, $child) = @_; + croak "Cowardly refusing to become my own parent: $self" + if $self == $child; + + if (grep { $_ == $child } $self->children) { + # All is potentially correct with the world + $child->parent($self); + return; + } + + $child->parent->remove_child($child) if $child->parent; + + $child->next($self->child); + $self->child($child); + $child->parent($self); +} + +sub remove_child { + my ($self, $child) = @_; + return unless $self->child; + if ($self->child == $child) { # First one's easy. + $self->child($child->next); + $child->next(undef); + $child->parent(undef); + return; + } + + my $x = $self->child; + my $prev = $x; + while ($x = $x->next) { + if ($x == $child) { + $prev->next($x->next); # Unlink x + $x->next(undef); + $x->parent(undef); # Deparent it + return; + } + $prev = $x; + } + # oddly, we can get here + $child->next(undef); + $child->parent(undef); +} + +sub has_descendent { + my $self = shift; + my $child = shift; + die "Assertion failed: $child" unless eval {$child}; + my $there = 0; + $self->recurse_down(sub { $there = 1 if $_[0] == $child }); + + return $there; +} + +sub children { + my $self = shift; + my @children; + my $visitor = $self->child; + while ($visitor) { + push @children, $visitor; + $visitor = $visitor->next + } + return @children; +} + +sub set_children { + my $self = shift; + my $walk = $self->child( shift ); + while (@_) { $walk = $walk->next( shift ) } + $walk->next(undef) if $walk; +} + +sub order_children { + my $self = shift; + my $ordersub = shift; + + return unless $ordersub; + + my $sub = sub { + my $cont = shift; + my @children = $cont->children; + return if @children < 2; + $cont->set_children( $ordersub->( @children ) ); + }; + $self->iterate_down( undef, $sub ); + undef $sub; +} + +# non-recursive version of recurse_down to avoid stack depth warnings +sub recurse_down { + my ($self, $callback) = @_; + my %seen; + my @q = ($self); + while (my $cont = shift @q) { + $seen{$cont}++; + $callback->($cont); + + if (my $next = $cont->next) { + if ($seen{$next}) { + $cont->next(undef); + } else { + push @q, $next; + } + } + if (my $child = $cont->child) { + if ($seen{$child}) { + $cont->child(undef); + } else { + push @q, $child; + } + } + } +} + +sub iterate_down { + my $self = shift; + my ($before, $after) = @_; + + my %seen; + my $walk = $self; + my $depth = 0; + my @visited; + while ($walk) { + push @visited, [ $walk, $depth ]; + $before->($walk, $depth) if $before; + + # spot/break loops + $seen{$walk}++; + + my $child = $walk->child; + if ($child && $seen{$child}) { + $walk->child(undef); + $child = undef; + } + + my $next = $walk->next; + if ($next && $seen{$next}) { + $walk->next(undef); + $next = undef; + } + + # go down, or across + if ($child) { + $next = $child; + ++$depth; + } + + # no next? look up + if (!$next) { + my $up = $walk; + while ($up && !$next) { + $up = $up->parent; + --$depth; + $next = $up->next if $up; + } + } + $walk = $next; + } + return unless $after; + while (@visited) { $after->(@{ pop @visited }) } +} + +1; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 4f0811a..da31109 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -11,7 +11,7 @@ use PublicInbox::View; use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape); use Email::MIME; require PublicInbox::Git; -require PublicInbox::Thread; +require PublicInbox::SearchThread; our $LIM = 50; sub noop {} @@ -152,7 +152,7 @@ sub mset_thread { $m; } ($mset->items); - my $th = PublicInbox::Thread->new(@m); + my $th = PublicInbox::SearchThread->new(\@m); $th->thread; if ($q->{r}) { # order by relevance $th->order(sub { diff --git a/lib/PublicInbox/Thread.pm b/lib/PublicInbox/Thread.pm deleted file mode 100644 index 8af9461..0000000 --- a/lib/PublicInbox/Thread.pm +++ /dev/null @@ -1,86 +0,0 @@ -# subclass Mail::Thread and use this to workaround a memory leak -# Based on the patch in: https://rt.cpan.org/Public/Bug/Display.html?id=22817 -# -# Additionally, workaround for a bug where $walk->topmost returns undef: -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913 -# - https://rt.cpan.org/Ticket/Display.html?id=106498 -# -# And avoid recursion in recurse_down: -# - https://rt.cpan.org/Ticket/Display.html?id=116727 -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479 -# -# License differs from the rest of public-inbox (but is compatible): -# This library is free software; you can redistribute it and/or modify -# it under the same terms as Perl itself. -package PublicInbox::Thread; -use strict; -use warnings; -use base qw(Mail::Thread); -# WARNING! both these Mail::Thread knobs were found by inspecting -# the Mail::Thread 2.55 source code, and we have some monkey patches -# in PublicInbox::Thread to fix memory leaks. Since Mail::Thread -# appears unmaintained, I suppose it's safe to depend on these -# variables for now: -{ - no warnings 'once'; - # we want strict threads to expose (and hopefully discourage) - # use of broken email clients - $Mail::Thread::nosubject = 1; - # Keep ghosts with only a single direct child, - # don't hide that there may be missing messages. - $Mail::Thread::noprune = 1; -} - -if ($Mail::Thread::VERSION <= 2.55) { - eval q(sub _container_class { 'PublicInbox::Thread::Container' }); -} - -package PublicInbox::Thread::Container; -use strict; -use warnings; -use base qw(Mail::Thread::Container); -use Scalar::Util qw(weaken); -sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} } - -sub topmost { - $_[0]->SUPER::topmost || PublicInbox::Thread::CPANRTBug106498->new; -} - -# non-recursive version of recurse_down to avoid stack depth warnings -sub recurse_down { - my ($self, $callback) = @_; - my %seen; - my @q = ($self); - while (my $cont = shift @q) { - $seen{$cont}++; - $callback->($cont); - - if (my $next = $cont->next) { - if ($seen{$next}) { - $cont->next(undef); - } else { - push @q, $next; - } - } - if (my $child = $cont->child) { - if ($seen{$child}) { - $cont->child(undef); - } else { - push @q, $child; - } - } - } -} - -# ref: -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913 -# - https://rt.cpan.org/Ticket/Display.html?id=106498 -package PublicInbox::Thread::CPANRTBug106498; -use strict; -use warnings; - -sub new { bless {}, $_[0] } - -sub simple_subject {} - -1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index a3b2681..9f1bf46 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -749,8 +749,8 @@ sub msg_timestamp { sub thread_results { my ($msgs) = @_; - require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@$msgs); + require PublicInbox::SearchThread; + my $th = PublicInbox::SearchThread->new($msgs); $th->thread; $th->order(*sort_ts); $th diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 4d599fc..11fc92e 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -112,7 +112,7 @@ sub call { sub preload { require PublicInbox::Feed; require PublicInbox::View; - require PublicInbox::Thread; + require PublicInbox::SearchThread; require Email::MIME; require Digest::SHA; require POSIX; diff --git a/t/plack.t b/t/plack.t index 608afb9..1d62458 100644 --- a/t/plack.t +++ b/t/plack.t @@ -11,8 +11,7 @@ my $pi_config = "$tmpdir/config"; my $maindir = "$tmpdir/main.git"; my $addr = 'test-public@example.com'; my $cfgpfx = "publicinbox.test"; -my @mods = qw(HTTP::Request::Common Plack::Test - Mail::Thread URI::Escape); +my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape); foreach my $mod (@mods) { eval "require $mod"; plan skip_all => "$mod missing for plack.t" if $@; -- EW