unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 05/13] use both Date: and Received: times
Date: Thu, 22 Mar 2018 09:40:07 +0000	[thread overview]
Message-ID: <20180322094015.14422-6-e@80x24.org> (raw)
In-Reply-To: <20180322094015.14422-1-e@80x24.org>

We want to rely on Date: to sort messages within individual
threads since it keeps messages from git-send-email(1) sorted.
However, since developers occasionally have the clock set
wrong on their machines, sort overall messages by the newest
date in a Received: header so the landing page isn't forever
polluted by messages from the future.

This also gives us determinism for commit times in most cases,
as we'll used the Received: timestamp there, as well.
---
 lib/PublicInbox/Import.pm            | 17 ++++----
 lib/PublicInbox/MsgTime.pm           | 80 ++++++++++++++++++++++++------------
 lib/PublicInbox/Search.pm            |  5 ++-
 lib/PublicInbox/SearchIdx.pm         |  6 ++-
 lib/PublicInbox/SearchIdxSkeleton.pm |  4 +-
 lib/PublicInbox/SearchMsg.pm         | 26 ++++++------
 lib/PublicInbox/SearchView.pm        |  6 +--
 lib/PublicInbox/View.pm              | 30 +++++++-------
 8 files changed, 103 insertions(+), 71 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index e50f115..d69934b 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -11,7 +11,7 @@ use base qw(PublicInbox::Lock);
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::MID qw(mids mid_mime mid2path);
 use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use PublicInbox::ContentId qw(content_digest);
 use PublicInbox::MDA;
 
@@ -244,9 +244,8 @@ sub remove {
 	(($self->{tip} = ":$commit"), $cur);
 }
 
-sub parse_date ($) {
-	my ($mime) = @_;
-	my ($ts, $zone) = msg_timestamp($mime->header_obj);
+sub git_timestamp {
+	my ($ts, $zone) = @_;
 	$ts = 0 if $ts < 0; # git uses unsigned times
 	"$ts $zone";
 }
@@ -295,7 +294,11 @@ sub add {
 	my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
 
 	my ($name, $email) = extract_author_info($mime);
-	my $date_raw = parse_date($mime);
+	my $hdr = $mime->header_obj;
+	my @at = msg_datestamp($hdr);
+	my @ct = msg_timestamp($hdr);
+	my $author_time_raw = git_timestamp(@at);
+	my $commit_time_raw = git_timestamp(@ct);
 	my $subject = $mime->header('Subject');
 	$subject = '(no subject)' unless defined $subject;
 	my $path_type = $self->{path_type};
@@ -349,8 +352,8 @@ sub add {
 
 	utf8::encode($subject);
 	print $w "commit $ref\nmark :$commit\n",
-		"author $name <$email> $date_raw\n",
-		"committer $self->{ident} ", now_raw(), "\n" or wfail;
+		"author $name <$email> $author_time_raw\n",
+		"committer $self->{ident} $commit_time_raw\n" or wfail;
 	print $w "data ", (length($subject) + 1), "\n",
 		$subject, "\n\n" or wfail;
 	if ($tip ne '') {
diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm
index 87664f4..4295e87 100644
--- a/lib/PublicInbox/MsgTime.pm
+++ b/lib/PublicInbox/MsgTime.pm
@@ -4,48 +4,76 @@ package PublicInbox::MsgTime;
 use strict;
 use warnings;
 use base qw(Exporter);
-our @EXPORT_OK = qw(msg_timestamp);
+our @EXPORT_OK = qw(msg_timestamp msg_datestamp);
 use Date::Parse qw(str2time);
 use Time::Zone qw(tz_offset);
 
-sub msg_timestamp ($) {
+sub zone_clamp ($) {
+	my ($zone) = @_;
+	$zone ||= '+0000';
+	# "-1200" is the furthest westermost zone offset,
+	# but git fast-import is liberal so we use "-1400"
+	if ($zone >= 1400 || $zone <= -1400) {
+		warn "bogus TZ offset: $zone, ignoring and assuming +0000\n";
+		$zone = '+0000';
+	}
+	$zone;
+}
+
+sub time_response ($) {
+	my ($ret) = @_;
+	wantarray ? @$ret : $ret->[0];
+}
+
+sub msg_received_at ($) {
 	my ($hdr) = @_; # Email::MIME::Header
-	my ($ts, $zone);
-	my $mid;
 	my @recvd = $hdr->header_raw('Received');
+	my ($ts, $zone);
 	foreach my $r (@recvd) {
 		$zone = undef;
 		$r =~ /\s*(\d+\s+[[:alpha:]]+\s+\d{2,4}\s+
 			\d+\D\d+(?:\D\d+)\s+([\+\-]\d+))/sx or next;
 		$zone = $2;
 		$ts = eval { str2time($1) } and last;
-		$mid ||= $hdr->header_raw('Message-ID');
+		my $mid = $hdr->header_raw('Message-ID');
 		warn "no date in $mid Received: $r\n";
 	}
-	unless (defined $ts) {
-		my @date = $hdr->header_raw('Date');
-		foreach my $d (@date) {
-			$zone = undef;
-			$ts = eval { str2time($d) };
-			if ($@) {
-				$mid ||= $hdr->header_raw('Message-ID');
-				warn "bad Date: $d in $mid: $@\n";
-			} elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) {
-				$zone = $1;
-			}
+	defined $ts ? [ $ts, zone_clamp($zone) ] : undef;
+}
+
+sub msg_date_only ($) {
+	my ($hdr) = @_; # Email::MIME::Header
+	my @date = $hdr->header_raw('Date');
+	my ($ts, $zone);
+	foreach my $d (@date) {
+		$zone = undef;
+		$ts = eval { str2time($d) };
+		if ($@) {
+			my $mid = $hdr->header_raw('Message-ID');
+			warn "bad Date: $d in $mid: $@\n";
+		} elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) {
+			$zone = $1;
 		}
 	}
-	$ts = time unless defined $ts;
-	return $ts unless wantarray;
+	defined $ts ? [ $ts, zone_clamp($zone) ] : undef;
+}
 
-	$zone ||= '+0000';
-	# "-1200" is the furthest westermost zone offset,
-	# but git fast-import is liberal so we use "-1400"
-	if ($zone >= 1400 || $zone <= -1400) {
-		warn "bogus TZ offset: $zone, ignoring and assuming +0000\n";
-		$zone = '+0000';
-	}
-	($ts, $zone);
+# Favors Received header for sorting globally
+sub msg_timestamp ($) {
+	my ($hdr) = @_; # Email::MIME::Header
+	my $ret;
+	$ret = msg_received_at($hdr) and return time_response($ret);
+	$ret = msg_date_only($hdr) and return time_response($ret);
+	wantarray ? (time, '+0000') : time;
+}
+
+# Favors the Date: header for display and sorting within a thread
+sub msg_datestamp ($) {
+	my ($hdr) = @_; # Email::MIME::Header
+	my $ret;
+	$ret = msg_date_only($hdr) and return time_response($ret);
+	$ret = msg_received_at($hdr) and return time_response($ret);
+	wantarray ? (time, '+0000') : time;
 }
 
 1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7e7c989..f08b987 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -8,11 +8,12 @@ use strict;
 use warnings;
 
 # values for searching
-use constant TS => 0; # timestamp
+use constant DS => 0; # Date: header in Unix time
 use constant NUM => 1; # NNTP article number
 use constant BYTES => 2; # :bytes as defined in RFC 3977
 use constant LINES => 3; # :lines as defined in RFC 3977
-use constant YYYYMMDD => 4; # for searching in the WWW UI
+use constant TS => 4;  # Received: header in Unix time
+use constant YYYYMMDD => 5; # for searching in the WWW UI
 
 use Search::Xapian qw/:standard/;
 use PublicInbox::SearchMsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3d80b00..ef723a4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,7 +134,9 @@ sub add_values ($$) {
 	my $lines = $values->[PublicInbox::Search::LINES];
 	add_val($doc, PublicInbox::Search::LINES, $lines);
 
-	my $yyyymmdd = strftime('%Y%m%d', gmtime($ts));
+	my $ds = $values->[PublicInbox::Search::DS];
+	add_val($doc, PublicInbox::Search::DS, $ds);
+	my $yyyymmdd = strftime('%Y%m%d', gmtime($ds));
 	add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd);
 }
 
@@ -298,7 +300,7 @@ sub add_message {
 		}
 
 		my $lines = $mime->body_raw =~ tr!\n!\n!;
-		my @values = ($smsg->ts, $num, $bytes, $lines);
+		my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts);
 		add_values($doc, \@values);
 
 		my $tg = $self->term_generator;
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index ba43969..78a1730 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -121,18 +121,16 @@ sub remote_remove {
 	die $err if $err;
 }
 
-# values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ]
+# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ]
 sub index_skeleton_real ($$) {
 	my ($self, $values) = @_;
 	my $doc_data = pop @$values;
 	my $xpath = pop @$values;
 	my $mids = pop @$values;
-	my $ts = $values->[PublicInbox::Search::TS];
 	my $smsg = PublicInbox::SearchMsg->new(undef);
 	my $doc = $smsg->{doc};
 	PublicInbox::SearchIdx::add_values($doc, $values);
 	$doc->set_data($doc_data);
-	$smsg->{ts} = $ts;
 	$smsg->load_from_data($doc_data);
 	my $num = $values->[PublicInbox::Search::NUM];
 	my @refs = ($smsg->references =~ /<([^>]+)>/g);
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index a1cd0c2..de1fd13 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -9,7 +9,7 @@ use warnings;
 use Search::Xapian;
 use PublicInbox::MID qw/mid_clean mid_mime/;
 use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 
 sub new {
 	my ($class, $mime) = @_;
@@ -46,6 +46,7 @@ sub load_expand {
 	my $doc = $self->{doc};
 	my $data = $doc->get_data or return;
 	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
+	$self->{ds} = get_val($doc, &PublicInbox::Search::DS);
 	utf8::decode($data);
 	load_from_data($self, $data);
 	$self;
@@ -53,12 +54,8 @@ sub load_expand {
 
 sub load_doc {
 	my ($class, $doc) = @_;
-	my $data = $doc->get_data or return;
-	my $ts = get_val($doc, &PublicInbox::Search::TS);
-	utf8::decode($data);
-	my $self = bless { doc => $doc, ts => $ts }, $class;
-	load_from_data($self, $data);
-	$self
+	my $self = bless { doc => $doc }, $class;
+	$self->load_expand;
 }
 
 # :bytes and :lines metadata in RFC 3977
@@ -91,9 +88,9 @@ my @MoY = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
 
 sub date ($) {
 	my ($self) = @_;
-	my $ts = $self->{ts};
-	return unless defined $ts;
-	my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ts);
+	my $ds = $self->{ds};
+	return unless defined $ds;
+	my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ds);
 	"$DoW[$wday], " . sprintf("%02d $MoY[$mon] %04d %02d:%02d:%02d +0000",
 				$mday, $year+1900, $hour, $min, $sec);
 
@@ -119,9 +116,12 @@ sub from_name {
 
 sub ts {
 	my ($self) = @_;
-	$self->{ts} ||= eval {
-		msg_timestamp($self->{mime}->header_obj);
-	} || 0;
+	$self->{ts} ||= eval { msg_timestamp($self->{mime}->header_obj) } || 0;
+}
+
+sub ds {
+	my ($self) = @_;
+	$self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0;
 }
 
 sub to_doc_data {
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 53b88c3..55c588c 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -117,11 +117,11 @@ sub mset_summary {
 			obfuscate_addrs($obfs_ibx, $s);
 			obfuscate_addrs($obfs_ibx, $f);
 		}
-		my $ts = PublicInbox::View::fmt_ts($smsg->ts);
+		my $date = PublicInbox::View::fmt_ts($smsg->ds);
 		my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href};
 		$$res .= qq{$rank. <b><a\nhref="$mid/">}.
 			$s . "</a></b>\n";
-		$$res .= "$pfx  - by $f @ $ts UTC [$pct%]\n\n";
+		$$res .= "$pfx  - by $f @ $date UTC [$pct%]\n\n";
 	}
 	$$res .= search_nav_bot($mset, $q);
 	*noop;
@@ -227,7 +227,7 @@ sub mset_thread {
 	} ($mset->items) ]});
 	my $r = $q->{r};
 	my $rootset = PublicInbox::SearchThread::thread($msgs,
-		$r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ts,
+		$r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ds,
 		$srch);
 	my $skel = search_nav_bot($mset, $q). "<pre>";
 	my $inbox = $ctx->{-inbox};
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index f811f4f..18882af 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -6,7 +6,7 @@
 package PublicInbox::View;
 use strict;
 use warnings;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_datestamp);
 use PublicInbox::Hval qw/ascii_html obfuscate_addrs/;
 use PublicInbox::Linkify;
 use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/;
@@ -735,7 +735,7 @@ sub load_results {
 sub thread_results {
 	my ($msgs, $srch) = @_;
 	require PublicInbox::SearchThread;
-	PublicInbox::SearchThread::thread($msgs, *sort_ts, $srch);
+	PublicInbox::SearchThread::thread($msgs, *sort_ds, $srch);
 }
 
 sub missing_thread {
@@ -746,7 +746,7 @@ sub missing_thread {
 
 sub _msg_date {
 	my ($hdr) = @_;
-	fmt_ts(msg_timestamp($hdr));
+	fmt_ts(msg_datestamp($hdr));
 }
 
 sub fmt_ts { POSIX::strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
@@ -782,7 +782,7 @@ sub skel_dump {
 	my $obfs_ibx = $ctx->{-obfs_ibx};
 	obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx;
 
-	my $d = fmt_ts($smsg->{ts}) . ' ' . indent_for($level) . th_pfx($level);
+	my $d = fmt_ts($smsg->{ds}) . ' ' . indent_for($level) . th_pfx($level);
 	my $attr = $f;
 	$ctx->{first_level} ||= $level;
 
@@ -863,10 +863,10 @@ sub _skel_ghost {
 	$$dst .= $d;
 }
 
-sub sort_ts {
+sub sort_ds {
 	[ sort {
-		(eval { $a->topmost->{smsg}->ts } || 0) <=>
-		(eval { $b->topmost->{smsg}->ts } || 0)
+		(eval { $a->topmost->{smsg}->ds } || 0) <=>
+		(eval { $b->topmost->{smsg}->ds } || 0)
 	} @{$_[0]} ];
 }
 
@@ -877,21 +877,21 @@ sub acc_topic {
 	my $srch = $ctx->{srch};
 	my $mid = $node->{id};
 	my $x = $node->{smsg} || $srch->lookup_mail($mid);
-	my ($subj, $ts);
+	my ($subj, $ds);
 	my $topic;
 	if ($x) {
 		$subj = $x->subject;
 		$subj = $srch->subject_normalized($subj);
-		$ts = $x->ts;
+		$ds = $x->ds;
 		if ($level == 0) {
-			$topic = [ $ts, 1, { $subj => $mid }, $subj ];
+			$topic = [ $ds, 1, { $subj => $mid }, $subj ];
 			$ctx->{-cur_topic} = $topic;
 			push @{$ctx->{order}}, $topic;
 			return;
 		}
 
 		$topic = $ctx->{-cur_topic}; # should never be undef
-		$topic->[0] = $ts if $ts > $topic->[0];
+		$topic->[0] = $ds if $ds > $topic->[0];
 		$topic->[1]++;
 		my $seen = $topic->[2];
 		if (scalar(@$topic) == 3) { # parent was a ghost
@@ -910,7 +910,7 @@ sub acc_topic {
 
 sub dump_topics {
 	my ($ctx) = @_;
-	my $order = delete $ctx->{order}; # [ ts, subj1, subj2, subj3, ... ]
+	my $order = delete $ctx->{order}; # [ ds, subj1, subj2, subj3, ... ]
 	if (!@$order) {
 		$ctx->{-html_tip} = '<pre>[No topics in range]</pre>';
 		return 404;
@@ -923,14 +923,14 @@ sub dump_topics {
 
 	# sort by recency, this allows new posts to "bump" old topics...
 	foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) {
-		my ($ts, $n, $seen, $top, @ex) = @$topic;
+		my ($ds, $n, $seen, $top, @ex) = @$topic;
 		@$topic = ();
 		next unless defined $top;  # ghost topic
 		my $mid = delete $seen->{$top};
 		my $href = mid_escape($mid);
 		my $prev_subj = [ split(/ /, $top) ];
 		$top = PublicInbox::Hval->new($top)->as_html;
-		$ts = fmt_ts($ts);
+		$ds = fmt_ts($ds);
 
 		# $n isn't the total number of posts on the topic,
 		# just the number of posts in the current results window
@@ -946,7 +946,7 @@ sub dump_topics {
 		my $mbox = qq(<a\nhref="$href/t.mbox.gz">mbox.gz</a>);
 		my $atom = qq(<a\nhref="$href/t.atom">Atom</a>);
 		my $s = "<a\nhref=\"$href/T/$anchor\"><b>$top</b></a>\n" .
-			" $ts UTC $n - $mbox / $atom\n";
+			" $ds UTC $n - $mbox / $atom\n";
 		for (my $i = 0; $i < scalar(@ex); $i += 2) {
 			my $level = $ex[$i];
 			my $subj = $ex[$i + 1];
-- 
EW


  parent reply	other threads:[~2018-03-22  9:40 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-22  9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 01/13] content_id: do not take Message-Id into account Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 02/13] introduce InboxWritable class Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 03/13] import: discard all the same headers as MDA Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-22  9:40 ` [PATCH 06/13] msgmap: add tmp_clone to create an anonymous copy Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 07/13] fix syntax warnings Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 08/13] v2writable: support reindexing Xapian Eric Wong (Contractor, The Linux Foundation)
2018-03-26 20:08   ` Eric Wong
2018-03-22  9:40 ` [PATCH 09/13] t/altid.t: extra tests for mid_set Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 10/13] v2writable: add NNTP article number regeneration support Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 11/13] v2writable: clarify header cleanups Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 12/13] v2writable: DEBUG_DIFF respects $TMPDIR Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 13/13] feed: $INBOX/new.atom endpoint supports v2 inboxes Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180322094015.14422-6-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).