use warnings; use strict;
use IO::Uncompress::Bunzip2 qw/$Bunzip2Error/;
# Require the dump filename as the first command-line argument.
if (!@ARGV) {
    die "Usage: count-L2-headers.pl DUMP_FILENAME > RESULTS.txt\n";
}

# Open the dump for line-by-line reading.  A name ending in ".bz2" is
# decompressed on the fly; anything else is opened as a plain file.
# Note: the element $ARGV[0] is required here -- the scalar $ARGV is an
# unrelated built-in (the file currently being read by <>).
my $dump;
if ($ARGV[0] =~ m/\.bz2$/) {
    # MultiStream => 1 makes a file consisting of several concatenated
    # bz2 streams read as one continuous stream instead of stopping at the
    # end of the first stream; single-stream files are unaffected.
    $dump = IO::Uncompress::Bunzip2->new($ARGV[0], MultiStream => 1)
        or die "Could not open '$ARGV[0]' as a .bz2 stream: $Bunzip2Error\n";
}
else {
    open $dump, '<', $ARGV[0]
        or die "Could not open '$ARGV[0]' for reading: $!\n";

    # Sanity check for uncompressed input: the very first byte must be '<'.
    # The byte is consumed and not pushed back, so the first line read
    # later from $dump lacks its leading '<'.  An empty or unreadable file
    # fails this check as well.
    my $lt;
    if (!read($dump, $lt, 1) || $lt ne '<') {
        die "'$ARGV[0]' does not begin with '<'.\n";
    }
}
# Collect the namespace names declared near the top of the dump.
# Every line up to (and including) the one containing </namespaces> is
# consumed from $dump.  For each <namespace ...>NAME</namespace> element
# found on a single line, NAME becomes a key of %namespaces.  Elements
# without text content (e.g. self-closing ones) do not match and are
# therefore not recorded.
my %namespaces;
while (my $line = <$dump>) {
    last if $line =~ m{</namespaces>};

    # \b keeps the pattern from matching the enclosing <namespaces> tag;
    # [^>]* skips whatever attributes the opening tag carries.
    if ($line =~ m{<namespace\b[^>]*>([^<]+)</namespace>}) {
        $namespaces{$1} = 1;
    }
}
# Scan the remainder of the dump and count level-2 headings.
#
# State carried across lines:
#   $title   - text of the most recent <title>...</title> line seen
#   $in_text - true while inside a <text xml:space="preserve"> element
#   %counts  - heading text => number of occurrences
#
# A heading is counted only when the current title has no "PREFIX:" part,
# or when PREFIX is not one of the keys of %namespaces.
my $title   = '';
my $in_text = 0;
my %counts;
LINE: while (my $line = <$dump>) {
    if (!$in_text) {
        if ($line =~ m{^\s*<title>([^<]+)</title>\s*$}) {
            $title = $1;
            next LINE;
        }

        # Strip the opening tag; whatever follows it on the same line is
        # already text content, so fall through and examine it below.
        next LINE unless $line =~ s{^\s*<text xml:space="preserve">}{};
        $in_text = 1;
    }

    # A closing tag ends the text after this line has been examined.
    if ($line =~ s{</text>\s*$}{}) {
        $in_text = 0;
    }

    # Match "==Heading==" but not "===Heading===": the negative lookahead
    # rejects lines where a third '=' opens and '===' closes the heading.
    next LINE unless $line =~ m/^==(?!=.*===\s*$)\s*(.*\S)\s*==\s*$/;
    my $L2 = $1;

    # Skip pages whose title starts with a known namespace prefix.
    my ($prefix) = $title =~ m/^([^:]+):/;
    next LINE if defined $prefix && $namespaces{$prefix};

    ++$counts{$L2};
}
# Emit the report on STDOUT: first a line holding only the three bytes
# EF BB BF (a UTF-8 byte-order mark), then one "* HEADING <em dash> COUNT"
# line per heading, ordered by Perl's default string sort of the headings.
print "\xEF\xBB\xBF\n";
my @headings = sort keys %counts;
for my $heading (@headings) {
    my $total = $counts{$heading};
    print "* $heading \xE2\x80\x94 $total\n";
}
__END__ </source>