#!/usr/bin/perl
###############################################################################
#
# WHAT?
#   Processes "spamd" logs and prints a summary report
#
# HOW?
#   spamd-stats [-t <max_top_tests>] [logfile ...]
#
#   Options:
#     -t 0 = disable showing top tests
#     -t -1 = disable limitation, show ALL tests (any negative value works)
#
# WHO?
#   Marius Feraru <altblue@n0i.net>
#
###############################################################################
use strict;
use warnings;
our $VERSION = sprintf '0.%d.%d', '\$Revision: 1.0 $' =~ /(\d+)\.(\d+)/xm;
use English qw( -no_match_vars );
use Getopt::Long qw(:config no_ignore_case bundling auto_version auto_help);
use List::Util qw(max min);
use Data::Dumper;

# default options...
my %opt = ( top_tests => 20 );

# wanna override them?
# GetOptions() returns false for unknown or malformed options (it has already
# warned about them by then); stop here instead of producing a report with
# settings the user did not ask for.
Getopt::Long::GetOptions( \%opt, qw[ top_tests|t=i ], )
    or die "Usage: $PROGRAM_NAME [-t <max_top_tests>] [logfile ...]\n";

# log files to scan...
my @log_files = @ARGV;

# default.... use default "maillog"
if ( !@log_files ) {
    push @log_files, '/var/log/maillog';
}

# STATS structure (filled in by process_log)
my $STATS = {
    total  => 0,    # number of processed messages
    spam   => 0,    # number of spam messages
    scores => {     # scores collection
        spam => [],
        ham  => [],
    },
    tests => {},    # stats by test
    learn => {},    # autolearn
};

foreach my $logfile (@log_files) {
    process_log($logfile);
}

# Report:
# The autolearn counters only exist once a matching line has been seen, and
# the score lists may be empty; substitute printable defaults so an empty or
# one-sided log does not produce "uninitialized value" warnings.
my %learned;
for my $class (qw( ham spam )) {
    my $count = $STATS->{learn}{$class};
    $learned{$class} = defined $count ? $count : 0;
}

my %range;
for my $class (qw( spam ham )) {
    my @scores = @{ $STATS->{scores}{$class} };
    $range{$class}
        = @scores
        ? { max => max(@scores), min => min(@scores) }
        : { max => 'n/a', min => 'n/a' };
}

print 'Processed ', $STATS->{total}, ' messages, ',
    valp( $STATS->{spam}, $STATS->{total} ), ' spam.', "\n", 'Learned from: ',
    valp( $learned{ham},  $STATS->{total} ), ' ham, ',
    valp( $learned{spam}, $STATS->{total} ), ' spam.', "\n",
    'Spam Scores:  max: ', $range{spam}{max}, ', min: ',
    $range{spam}{min}, "\n", 'Ham Scores:  max: ',
    $range{ham}{max}, ', min: ',
    $range{ham}{min}, "\n";

# Tests section: 0 disables it, a positive value limits it to that many
# entries, a negative value lists every test.
if ( $opt{top_tests} ) {
    if ( $opt{top_tests} > 0 ) {
        print 'Top ', $opt{top_tests}, q{ };
    }
    print "Tests hit:\n";

    # Most frequent first; equal counts are ordered by name so the output
    # (and the cut-off below) is the same on every run.
    my @ranked = sort {
        $STATS->{tests}{$b} <=> $STATS->{tests}{$a}
            || $a cmp $b
    } keys %{ $STATS->{tests} };

    if ( $opt{top_tests} > 0 && @ranked > $opt{top_tests} ) {
        $#ranked = $opt{top_tests} - 1;
    }
    for my $test (@ranked) {
        printf "%8d %s\n", $STATS->{tests}{$test}, $test;
    }
}

# Value with percent attached
#   $value - count to display; an undefined count is shown as 0
#   $total - what 100% stands for; a false total (0 or undef) gives 0.00%
# Returns a string such as "5 (25.00%)".
sub valp {
    my ( $value, $total ) = @_;

    # A counter that was never incremented arrives here as undef; treat it
    # as zero instead of tripping "uninitialized value" warnings below.
    if ( !defined $value ) {
        $value = 0;
    }
    return sprintf '%d (%.2f%%)', $value, $total ? 100 * $value / $total : 0;
}

# decompressors: looks lazy, but it's FASTER than using any Compress::
# = RegExp is applied to file name
#
# The table is filled in a BEGIN block: the main code at the top of the file
# calls process_log()/get_fh() before execution ever reaches this point, so a
# plain run-time assignment here would leave the table empty when it is used.
my %DECOMPRESSORS;

BEGIN {
    %DECOMPRESSORS = (
        '/bin/zcat'      => qr/\.(?i:gz)$/,
        '/usr/bin/bzcat' => qr/\.(?i:bz2)$/,
    );
}

# get a log filehandle...
#   $logfile - path of the log file; a name matching one of the
#              %DECOMPRESSORS patterns is read through that program
# Returns a filehandle opened for reading.
# Dies if the file cannot be opened or the decompressor cannot be started.
sub get_fh {
    my $logfile = shift;

    # Iterate over the keys rather than with each(): returning from the
    # middle of an each() loop leaves the hash iterator half-way, so the next
    # call would skip entries and hand back a compressed log as raw data.
    for my $dec ( sort keys %DECOMPRESSORS ) {
        next if $logfile !~ $DECOMPRESSORS{$dec};
        open my $fh, q{-|}, $dec, $logfile
            or die "Cannot decompress $logfile with $dec: $OS_ERROR\n";
        return $fh;
    }
    open my $fh, q{<}, $logfile or die "Cannot open $logfile: $OS_ERROR\n";
    return $fh;
}

# Log file processor...
#   Arguments are handed to get_fh() untouched (the path of one log file).
#   Every line holding a "spamd: result:" record is folded into the global
#   $STATS: message totals, per-class score lists, per-test hit counts and
#   autolearn counters.  Lines that do not match are ignored.
sub process_log {
    my $log_fh = get_fh(@_);

    LINE:
    while ( my $entry = <$log_fh> ) {

        # Captures, in order: verdict character, integer score,
        # comma-separated test names, and the field searched for autolearn=.
        my ( $verdict, $score, $tests, $details ) = $entry =~ m{
            ^ .+?
            spamd: \s+ result: \s+
            (\S) \s+
            (-?\d+)
            \s+ - \s+
            (\S+) \s+
            (\S+)
        }xms
            or next LINE;

        # Only a verdict of 'Y' counts as spam; anything else is ham.
        my $class = $verdict eq 'Y' ? 'spam' : 'ham';

        $STATS->{total}++;
        $STATS->{spam}++ if $class eq 'spam';
        push @{ $STATS->{scores}{$class} }, $score;

        $STATS->{tests}{$_}++ for split /,/xm, $tests;

        if ( my ($mode) = $details =~ /autolearn=([^,]+)/xm ) {
            $STATS->{learn}{$mode}++;
        }
    }

    close $log_fh;
    return;
}