From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by olra.theworths.org (Postfix) with ESMTP id 58603431FBD for ; Wed, 1 Aug 2012 01:10:06 -0700 (PDT) X-Virus-Scanned: Debian amavisd-new at olra.theworths.org X-Spam-Flag: NO X-Spam-Score: 0 X-Spam-Level: X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none] autolearn=disabled Received: from olra.theworths.org ([127.0.0.1]) by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id aKIf-ipc8p2b for ; Wed, 1 Aug 2012 01:10:05 -0700 (PDT) Received: from upsilon.hackadomia.org (upsilon.hackadomia.org [91.121.245.170]) by olra.theworths.org (Postfix) with ESMTP id 1A80B431FBC for ; Wed, 1 Aug 2012 01:10:05 -0700 (PDT) Received: from usha.takhisis.invalid (unknown [151.59.176.148]) by upsilon.hackadomia.org (Postfix) with ESMTPSA id 151B496024; Wed, 1 Aug 2012 10:10:04 +0200 (CEST) Received: by usha.takhisis.invalid (Postfix, from userid 1000) id 0BF9E683154; Wed, 1 Aug 2012 10:10:03 +0200 (CEST) From: Stefano Zacchiroli To: notmuch@notmuchmail.org Subject: [PATCH 1/2] Add duplicate message removal for notmuch-mutt. Date: Wed, 1 Aug 2012 10:09:41 +0200 Message-Id: <1343808582-9519-2-git-send-email-zack@upsilon.cc> X-Mailer: git-send-email 1.7.10.4 In-Reply-To: <1343808582-9519-1-git-send-email-zack@upsilon.cc> References: <1343808582-9519-1-git-send-email-zack@upsilon.cc> Cc: "Kevin J. McCarthy" , Stefano Zacchiroli X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Use and development of the notmuch mail system." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 01 Aug 2012 08:10:06 -0000 From: Kevin McCarthy Add a --remove-dups flag which removes duplicate files from search and thread results. Uses fdupes if installed. Otherwise it runs a size and Digest::SHA scan on each file to detect duplicates. Signed-off-by: Stefano Zacchiroli --- contrib/notmuch-mutt/notmuch-mutt | 89 ++++++++++++++++++++++++++++------ contrib/notmuch-mutt/notmuch-mutt.rc | 4 +- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/contrib/notmuch-mutt/notmuch-mutt b/contrib/notmuch-mutt/notmuch-mutt index 7c125e6..d14709d 100755 --- a/contrib/notmuch-mutt/notmuch-mutt +++ b/contrib/notmuch-mutt/notmuch-mutt @@ -18,6 +18,8 @@ use Mail::Box::Maildir; use Pod::Usage; use String::ShellQuote; use Term::ReadLine; +use Digest::SHA; +use File::Which; my $xdg_cache_dir = "$ENV{HOME}/.cache"; @@ -34,16 +36,65 @@ sub empty_maildir($) { $folder->close(); } -# search($maildir, $query) +# Match files by size and SHA-256; then delete duplicates +sub builtin_remove_dups($) { + my ($maildir) = @_; + my (%size_to_files, %sha_to_files); + + # Group files by matching sizes + foreach my $file (glob("$maildir/cur/*")) { + my $size = -s $file; + push(@{$size_to_files{$size}}, $file) if $size; + } + + foreach my $same_size_files (values %size_to_files) { + # Don't run sha unless there is another file of the same size + next if scalar(@$same_size_files) < 2; + %sha_to_files = (); + + # Group files with matching sizes by SHA-256 + foreach my $file (@$same_size_files) { + open(my $fh, '<', $file) or next; + binmode($fh); + my $sha256hash = Digest::SHA->new(256)->addfile($fh)->hexdigest; + close($fh); + + push(@{$sha_to_files{$sha256hash}}, $file); + } + + # Remove duplicates + foreach my $same_sha_files (values %sha_to_files) { + next if scalar(@$same_sha_files) < 2; + unlink(@{$same_sha_files}[1..$#$same_sha_files]); + } + } +} + +# Use either fdupes or the built-in scanner to detect and remove duplicate +# search results in the maildir +sub remove_duplicates($) { + my ($maildir) = @_; + + my $fdupes = which("fdupes"); + if ($fdupes) { + system("$fdupes --hardlinks --symlinks --delete --noprompt" + . " --quiet $maildir/cur/ > /dev/null"); + } else { + builtin_remove_dups($maildir); + } +} + +# search($maildir, $remove_dups, $query) # search mails according to $query with notmuch; store results in $maildir -sub search($$) { - my ($maildir, $query) = @_; +sub search($$$) { + my ($maildir, $remove_dups, $query) = @_; $query = shell_quote($query); empty_maildir($maildir); system("notmuch search --output=files $query" . " | sed -e 's: :\\\\ :g'" . " | xargs --no-run-if-empty ln -s -t $maildir/cur/"); + remove_duplicates($maildir) if ($remove_dups); } sub prompt($$) { @@ -74,28 +125,28 @@ sub get_message_id() { return $1; } -sub search_action($$@) { - my ($interactive, $results_dir, @params) = @_; +sub search_action($$$@) { + my ($interactive, $results_dir, $remove_dups, @params) = @_; if (! $interactive) { - search($results_dir, join(' ', @params)); + search($results_dir, $remove_dups, join(' ', @params)); } else { my $query = prompt("search ('?' for man): ", join(' ', @params)); if ($query ne "") { - search($results_dir,$query); + search($results_dir, $remove_dups, $query); } } } -sub thread_action(@) { - my ($results_dir, @params) = @_; +sub thread_action($$@) { + my ($results_dir, $remove_dups, @params) = @_; my $mid = get_message_id(); my $search_cmd = 'notmuch search --output=threads ' . shell_quote("id:$mid"); my $tid = `$search_cmd`; # get thread id chomp($tid); - search($results_dir, $tid); + search($results_dir, $remove_dups, $tid); } sub tag_action(@) { @@ -118,11 +169,13 @@ sub main() { my $results_dir = "$cache_dir/results"; my $interactive = 0; my $help_needed = 0; + my $remove_dups = 0; my $getopt = GetOptions( "h|help" => \$help_needed, "o|output-dir=s" => \$results_dir, - "p|prompt" => \$interactive); + "p|prompt" => \$interactive, + "r|remove-dups" => \$remove_dups); if (! $getopt || $#ARGV < 0) { die_usage() }; my ($action, @params) = ($ARGV[0], @ARGV[1..$#ARGV]); @@ -136,9 +189,9 @@ sub main() { print STDERR "Error: no search term provided\n\n"; die_usage(); } elsif ($action eq "search") { - search_action($interactive, $results_dir, @params); + search_action($interactive, $results_dir, $remove_dups, @params); } elsif ($action eq "thread") { - thread_action($results_dir, @params); + thread_action($results_dir, $remove_dups, @params); } elsif ($action eq "tag") { tag_action(@params); } else { @@ -189,6 +242,12 @@ be overwritten. (Default: F<~/.cache/notmuch/mutt/results/>) Instead of using command line search terms, prompt the user for them (only for "search"). +=item -r + +=item --remove-dups + +Remove duplicates from search results. + =item -h =item --help @@ -205,10 +264,10 @@ the following in your Mutt configuration (usually one of: F<~/.muttrc>, F, or a configuration snippet under F): macro index \ - "unset wait_keynotmuch-mutt --prompt search~/.cache/notmuch/mutt/results" \ + "unset wait_keynotmuch-mutt -r --prompt search~/.cache/notmuch/mutt/results" \ "notmuch: search mail" macro index \ - "unset wait_keynotmuch-mutt thread~/.cache/notmuch/mutt/resultsset wait_key" \ + "unset wait_keynotmuch-mutt -r thread~/.cache/notmuch/mutt/resultsset wait_key" \ "notmuch: reconstruct thread" macro index \ "unset wait_keynotmuch-mutt tag -- -inbox" \ diff --git a/contrib/notmuch-mutt/notmuch-mutt.rc b/contrib/notmuch-mutt/notmuch-mutt.rc index b0a38d1..ddc4b48 100644 --- a/contrib/notmuch-mutt/notmuch-mutt.rc +++ b/contrib/notmuch-mutt/notmuch-mutt.rc @@ -1,8 +1,8 @@ macro index \ - "unset wait_keynotmuch-mutt --prompt search`echo ${XDG_CACHE_HOME:-$HOME/.cache}/notmuch/mutt/results`" \ + "unset wait_keynotmuch-mutt -r --prompt search`echo ${XDG_CACHE_HOME:-$HOME/.cache}/notmuch/mutt/results`" \ "notmuch: search mail" macro index \ - "unset wait_keynotmuch-mutt thread`echo ${XDG_CACHE_HOME:-$HOME/.cache}/notmuch/mutt/results`set wait_key" \ + "unset wait_keynotmuch-mutt -r thread`echo ${XDG_CACHE_HOME:-$HOME/.cache}/notmuch/mutt/results`set wait_key" \ "notmuch: reconstruct thread" macro index \ "unset wait_keynotmuch-mutt tag -- -inbox" \ -- 1.7.10.4