#!/usr/bin/perl # Keywords mesh # 2007020201 altblue@n0i.net # $Id$ use strict; use warnings; our $VERSION = '0.5.1'; use English qw( -no_match_vars ); use Algorithm::FastPermute qw(permute); use Carp; use Getopt::Long qw(:config no_ignore_case bundling auto_version auto_help); # in memory cache my ( @keywords, @urls, @results, @missed ); my %opt = ( marker => qr{^x+$}, # how are "KW block markers" defined? not_found => 'NOTFOUND', # when a coresponding is not found, # how to mark that keyword? verbose => 0, # verbosity level max_words => 8, # words limit (bailout out on overflow) ); Getopt::Long::GetOptions( \%opt, qw[ not_found|n=s verbose|v+ max_words|w=i ] ); # keywords file, keyword-url file my ( $kw_file, $url_file ) = @ARGV; load_keywords($kw_file); load_urls($url_file); collect_results(); report(); print_results(); exit 0; ### support functions # add keyword & URL to results set sub add_result { my ( $ks, $url ) = @_; my $name = join q{ }, @{ $ks->[2] }; $results[ $ks->[0] ] = join "\t", $name, $url || $opt{not_found}; return $name; } # iterate through keywords, finding URLs sub collect_results { my $kidx = 1; # keyword index my $kwnum = scalar @keywords; for my $ks (@keywords) { if ( @{ $ks->[2] } > $opt{max_words} ) { debug( 1, "Max words limit reached ($opt{max_words})!" ); last; } my $kw = join q{ }, @{ $ks->[2] }; my ( $line, $sample ) = @{$ks}; my @words = unique_words( @{ $ks->[2] } ); my $variants = fact( scalar @words ); debug( 2, "($kidx of $kwnum) $kw ($variants variants) [URLs: " . scalar @urls . q{]} ); my $found = 0; # keyword match switch if ( $variants < 7 ) { # play memory safe here too my @rex; permute { my $quoted = join q{\s+}, map { quotemeta } @words; push @rex, qr{^(?i:$quoted)\t+(\S+)}; } @words; $found = find_url( $ks, 0, @rex ); } else { # walk sideways (iterating through URLs for each variant) my $ridx = 0; # variant index eval { # ugly hack (jumping outside a pseudoblock) permute { my $quoted = join q{\s+}, map { quotemeta } @words; $found = find_url( $ks, $ridx++, qr{^(?i:$quoted)\t+(\S+)} ); croak if $found; } @words; }; } if ( !$found ) { push @missed, $kw; add_result($ks); } $kidx++; } if ( $kidx < @keywords ) { push @missed, map { add_result($_) } @keywords[ $kidx - 1 .. $#keywords ]; } return; } # dump lines to STDERR if verbosity threshold is met sub debug { my $level = shift; return if $opt{verbose} < $level; return print {*STDERR} q{#} x $level . " @_\n"; } # factorial: used to determine the number of variants sub fact { my $n = 1; for ( 1 .. shift ) { $n *= $_; } return $n; } # match Keyword variant(s) against the list of remaining URLs # $sridx = Starting Variant index (used for debugging) sub find_url { my ( $ks, $sridx, @rex ) = @_; my $uidx = 0; # URL index for my $uline (@urls) { debug( 3, "URL line: $uline" ); my $ridx = $sridx || 0; for my $re (@rex) { $ridx++; debug( 5, "[$ridx] match against $re" ); if ( $uline =~ /\b\Q$ks->[1]\E\b/xm && $uline =~ $re ) { debug( 2, "[$ridx] MATCHED" ); add_result( $ks, $1 ); splice @urls, $uidx, 1; # drop URL on successful match return 1; } } $uidx++; } return; # no match } # store the entire KW file into memory # in a [ line_number, sample_word, @words ] structure # = sample_word: keyword's longest word (micro-optimization) sub load_keywords { my $file = shift; open my $fh, q{<}, $file or croak "Cannot open $file: $OS_ERROR"; my $idx = 0; while ( my $line = <$fh> ) { chomp $line; for ($line) { s/^\s+//xms; s/\s+$//xms; s/\s+/ /xms; } next if $line !~ /\S/xm; if ( $line =~ $opt{marker} ) { # markers go "as is" $results[ $idx++ ] = $line; next; } my @words = split /\s/xm, $line; my ($sample) = reverse sort { length $a <=> length $b } @words; push @keywords, [ $idx++, $sample, \@words ]; } close $fh or croak "Cannot close $kw_file: $OS_ERROR"; # sort keywords in a somehow anticipated complexity order @keywords = sort { scalar @{ $a->[2] } <=> scalar @{ $b->[2] } || length $a->[1] <=> length $b->[1] } @keywords; return scalar @keywords; } # store the entire KW=URL file into memory sub load_urls { my $file = shift; open my $fh, q{<}, $file or croak "Cannot open $file: $OS_ERROR"; while ( my $line = <$fh> ) { chomp $line; next if $line !~ /\S/xm; push @urls, $line; } close $fh or croak "Cannot close $kw_file: $OS_ERROR"; return scalar @urls; } # print results to STDOUT sub print_results { for (@results) { print $_, "\n"; } return; } # report failures sub report { return if $opt{verbose} < 3; if (@missed) { debug( 3, 'These Keywords have no coresponding URL:' ); for (@missed) { debug( 3, $_ ); } } if (@urls) { debug 3, 'These URLs have no coresponding Keyword:'; for (@urls) { debug( 3, $_ ); } } return; } # keywords may contain duplicate words, # avoid them when matching against KW=URL list sub unique_words { my %h = map { lc $_ => 1 } @_; return keys %h; }