#!/usr/bin/perl -w
use strict;
use Data::Dumper;
use Mail::Util ();
use Mail::Header ();
use constant DEBUG => 0;
use constant TOP_TESTS => 30;

# Running totals for the whole run. spam_stats() updates them for every
# message it reads and the report at the end of the script prints them.
my $STATS = {
    # Messages read, and how many of them had no X-Spam-Status header.
    count => 0,
    miss  => 0,

    # Tallies of the verdict and of the autolearn= value.
    is_spam   => { Yes  => 0, No  => 0 },
    autolearn => { spam => 0, ham => 0 },

    # Extreme hit values seen so far and the Message-IDs they came from.
    # min starts high and max starts low so the first value replaces both.
    hits  => { min => 1000, max => -1000 },
    msgid => { min => '',   max => '' },

    # Test name => number of messages listing it in tests=.
    tests => {},

    # Checker => version => { messages, reqhits }.
    checked_by => {},
};

# current_spambox()
#
# Returns the path of this month's spam mailbox,
# "<HOME>/.PTTR/spam-YYYYMM", using the local date.
sub current_spambox {
    my ($mon, $year) = (localtime)[4,5];
    # HOME is handed to sprintf as data rather than spliced into the format,
    # so a '%' in the home directory cannot be taken for a conversion.
    my $home = defined $ENV{HOME} ? $ENV{HOME} : '';
    return sprintf '%s/.PTTR/spam-%04d%02d', $home, $year+1900, $mon+1;
}

# show_count()
#
# Redraws the progress counter: emits six backspaces, then prints
# $STATS->{count} right-aligned in a six-character, blank-padded field.
sub show_count {
    # "%6d" pads with blanks directly. Zero-padding and then rewriting the
    # zeros with s/(?:^| )0/ /g only ever blanked the first one, because /g
    # does not rescan text it has just substituted ("000042" => " 00042").
    print "\b"x6, sprintf('%6d', $STATS->{count});
}

# Zero-based column of a multi-valued "score" line that get_scores() reads.
my $SCORE_SET = 3;
# Files get_scores() scans for "score" lines, in this order; a rule found in
# a later file replaces the value taken from an earlier one.
my @SCORE_FILES = (
		'/usr/share/spamassassin/50_scores.cf',
		'/etc/mail/spamassassin/local.cf',
		"$ENV{HOME}/.spamassassin/user_prefs"
	);
# Rule name => score. This has to start as a hash reference: assigning the
# empty list "()" to a scalar leaves it undef, which only worked as long as
# every access happened to autovivify it.
my $SCORES = {};

# get_scores()
#
# Reads every file in @SCORE_FILES that can be opened and stores each
# "score RULE v0 [v1 v2 v3]" line in $SCORES as rule name => score.
# The column picked is $SCORE_SET when the line has that many values,
# otherwise the first value. Files that cannot be opened are skipped.
sub get_scores {
	foreach my $file (@SCORE_FILES) {
		# Three-argument open: the path is never parsed for a mode or a pipe.
		open(my $fh, '<', $file) or next;
		# Reading into a named variable keeps the loop from assigning to the
		# foreach alias, which used to overwrite the @SCORE_FILES entries.
		while (my $line = <$fh>) {
			next unless $line =~ /^score\s+(\S+)\s+(.+?)\s*$/;
			my ($rule, $scores) = ($1, $2);
			# Drop a trailing "# comment" so its words are not taken for
			# score columns.
			$scores =~ s/\s*#.*$//;
			my @sc = split /\s+/, $scores;
			next unless @sc;
			# Test for presence, not truth: a score of 0 is a real value and
			# must not fall back to the first column.
			$SCORES->{$rule} = defined $sc[$SCORE_SET] ? $sc[$SCORE_SET] : $sc[0];
		}
		close $fh;
	}
}

# spam_stats($mbox)
#
# Reads every message of one mbox file and adds its SpamAssassin headers to
# the file-level $STATS totals. Returns immediately when no path is given
# or the path is not a plain file.
#
# For each message it takes from X-Spam-Status: the yes/no verdict, the
# version= and required= values (grouped by the checker named in
# X-Spam-Check-By or X-Spam-Checker-Version), the autolearn= value, the
# lowest and highest hits/score value with the Message-ID it came from, and
# a tally of every name in tests=. A message without X-Spam-Status is
# counted in $STATS->{miss} and otherwise ignored.
sub spam_stats {
  my $INBOX = shift || return;
  return unless -f $INBOX;
  my $hd = Mail::Header->new;
  print "Using $INBOX (" . (-s $INBOX) . ' bytes) ... ' . (' 'x6);
  foreach my $msg (Mail::Util::read_mbox($INBOX)) {
    $STATS->{count}++;
    show_count();

    $hd->empty;
    $hd->extract($msg);
    $hd->unfold('X-Spam-Status');
    $hd->unfold('X-Spam-Check-By');
    $hd->unfold('X-Spam-Checker-Version');

    # A message may have no Message-ID; fall back to '' so neither the
    # clean-up below nor the final report has to deal with undef.
    my $msgid = $hd->get('Message-ID');
    $msgid = '' unless defined $msgid;
    my $ckdby = $hd->get('X-Spam-Check-By')
                || $hd->get('X-Spam-Checker-Version')
                || 'none';
    for my $field ($ckdby, $msgid) {
      $field =~ s/\s+$//;
      $field =~ s/^\s+//;
      $field =~ s/^.+\s+//;   # keep only the last whitespace-separated word
    }

    my $status = $hd->get('X-Spam-Status');
    unless ($status) { # message was NOT checked by SpamAssassin!
      $STATS->{miss}++;
      next;
    }

    # Close up "key, value" gaps so the verdict and the tests= list can be
    # matched as single whitespace-free tokens.
    $status =~ s/,\s+/,/g;

    # The match ignores case but the report reads only the 'Yes' and 'No'
    # keys, so fold whatever spelling the header used onto those two.
    if ($status =~ /^(yes|no),/i) {
      $STATS->{is_spam}{ ucfirst lc $1 }++;
    }

    if ($status =~ /\bversion=(\S+)/) {
      my $sa_version = $1;
      my $by_version = $STATS->{checked_by}{$ckdby}{$sa_version} ||= {};
      $by_version->{messages}++;
      $by_version->{reqhits} = $1 if $status =~ /\brequired=(\S+)/;
    }

    if ($status =~ /\bautolearn=(\S+)/) {
      $STATS->{autolearn}{$1}++;
    }

    # Older SpamAssassin releases label the value "hits=", newer ones
    # "score="; accept either.
    if ($status =~ /\b(?:hits|score)=(\S+)/) {
      my $hits = $1;
      ($STATS->{hits}{min}, $STATS->{msgid}{min}) = ($hits, $msgid)
        if $hits < $STATS->{hits}{min};
      ($STATS->{hits}{max}, $STATS->{msgid}{max}) = ($hits, $msgid)
        if $hits > $STATS->{hits}{max};
    }

    if ($status =~ /tests=(\S+)/) {
      for my $test (split /,/, $1) {
        $STATS->{tests}{$test}++;
      }
    }
  }
  print $/;
  print Dumper($STATS) if DEBUG;
}

# Driver: load the rule scores, scan every mailbox named on the command
# line (or this month's spam box when none is given), then print the report.
$| = 1;    # unbuffered, so the progress counter is redrawn as it changes
get_scores();
my @INBOX = @ARGV ? @ARGV : (current_spambox());
spam_stats($_) for @INBOX;

### REPORT!
my $checked = $STATS->{count} - $STATS->{miss};
# The hit sentinels start with min above max. If that still holds, no value
# was ever recorded, and printing the sentinels would pass them off as data.
my $no_hits  = $STATS->{hits}{min} > $STATS->{hits}{max};
my $max_hits = $no_hits ? 'n/a' : $STATS->{hits}{max};
my $min_hits = $no_hits ? 'n/a' : $STATS->{hits}{min};
print <<REPORT;
SpamAssassin checked $checked messages from a total of $STATS->{count} messages.
It claimed that $STATS->{is_spam}{Yes} are spam and $STATS->{is_spam}{No} are ham.
In the process it auto-learnt from $STATS->{autolearn}{spam} spam and $STATS->{autolearn}{ham} ham messages.
Record hits were:
	maximum: $max_hits ($STATS->{msgid}{max})
	minimum: $min_hits ($STATS->{msgid}{min})
REPORT

# A TOP_TESTS of 0 lists every test instead of only the most frequent ones.
print((TOP_TESTS ? 'Top ' . TOP_TESTS : 'All'), ' matching tests:', $/);
my $st = TOP_TESTS || scalar keys %{$STATS->{tests}};
# Most frequent first; equal counts are ordered by name so that two runs
# over the same mail print the same listing.
foreach my $test (sort { $STATS->{tests}{$b} <=> $STATS->{tests}{$a}
							or $a cmp $b }
			keys %{$STATS->{tests}}) {
	last if $st-- <= 0;
	# Not every test has a "score" line in the files that were read.
	my $score = $SCORES->{$test};
	$score = '' unless defined $score;
	printf "%6d\t%s (%s)\n", $STATS->{tests}{$test}, $test, $score;
}

print 'Checks were done by:', $/;
foreach my $host (sort keys %{$STATS->{checked_by}}) {
	my $by_version = $STATS->{checked_by}{$host};
	# Sorted keys instead of each(): stable order, no shared iterator state.
	my @versions = sort keys %$by_version;
	# A checker with a single version fits on one line; otherwise the host
	# gets a heading and each version its own indented line.
	my $one_line = @versions == 1;
	print "\t$host:\n" unless $one_line;
	foreach my $ver (@versions) {
		my $nfo = $by_version->{$ver};
		# required= may have been absent from the header.
		my $reqhits = defined $nfo->{reqhits} ? $nfo->{reqhits} : '';
		print(($one_line ? "\t$host: " : "\t\t"),
				"$nfo->{messages} messages,",
				" (version $ver, required_hits: $reqhits)\n");
	}
}