#!/usr/bin/perl
#
# List most frequently used links, based on referrer
# information from the apache combined log file.
#
# Usage: linkstats.pl [options] logfile
#
# where options are:
#
#   -a            Include images, stylesheets, javascript
#   -errors       Only report on errors (status >= 400)
#   -redirects    Only report on redirects (status 301-303)
#   -status=      Only report on requests with this status code
#   -refsite=     Only report on referrers from this site/area
#   -refurl=      Only report on referrers from this url
#   -site=        Only report on this area (starts with match)
#   -url=         Only report on this url (exact match)
#   --            End of options
#
# The -refsite/-refurl/-site/-url options may be repeated; a log entry is
# reported when it matches ANY of them.  Their values are interpolated into
# regular expressions, so regex metacharacters in them are significant.
# They are tested against the request/referrer as logged, i.e. before the
# query string and fragment are stripped for reporting.
#
# eg. linkstats.pl -refurl=http://www.lights.com/ access_log
#     linkstats.pl -site=/publisher/ /var/apache/log/access_log
#
# Earl Fogel, February 2000

use strict;
use warnings;

my $mincount   = 10;      # ignore entries which occur fewer times than this
my $minpercent = 0.05;    # ignore links which account for less than this
                          # percent of the total

# Run only when executed as a script, so the subs below can also be loaded
# (require/do) without immediately reading a log file.
main(@ARGV) unless caller;

# main(@args)
#
# Entry point.  @args is the command line: options followed by log file
# names (STDIN is read when no file is named).  Tallies every parseable
# log line that passes the filters into one of four categories and prints
# a report for each non-empty category to the currently selected handle.
sub main {
    local @ARGV = @_;          # <> reads whatever parse_options() leaves here
    my $opt = parse_options(\@ARGV);

    my (%good, %bad, %redir, %error);

  LINE:
    while (my $line = <>) {
        my ($request, $code, $refer) = parse_log_line($line);
        next LINE unless defined $code;    # unparseable lines are skipped silently
        # warn "Parse Error in: $line";

        next LINE if defined $opt->{status} && $code != $opt->{status};         # only this status
        next LINE if $opt->{errors}    && $code < 400;                          # only report errors
        next LINE if $opt->{redirects} && ($code < 301 || $code > 303);         # only report redirects
        next LINE if $refer eq '-';                                             # no referrer info
        next LINE if !$opt->{all} && $request =~ m#\.(gif|jpg|jpeg|css|js)$#i;  # page furniture

        # With no url/site filters everything is wanted; otherwise a single
        # matching filter of any kind is enough.
        my $wanted = !$opt->{filtered}
            || matches_any($request, $opt->{urls})        # only this url
            || matches_any($request, $opt->{sites})       # only this site
            || matches_any($refer,   $opt->{refurls})     # only this referring url
            || matches_any($refer,   $opt->{refsites});   # only this ref. site
        next LINE unless $wanted;

        # Report on the bare path: drop a stray trailing quote, then the
        # query string and/or fragment.
        $request =~ s/"$//;
        $request =~ s/[?#].*//;
        $refer   =~ s/[?#].*//;

        if ($code == 404) {
            $bad{"$refer => $request"}++;
        }
        elsif ($code == 301 || $code == 302 || $code == 303) {
            $redir{"$refer => $request"}++;
        }
        elsif ($code < 400) {
            $good{"$refer => $request"}++;
        }
        else {
            $error{"(status $code) $refer => $request"}++;
        }
    }

    DisplayTop(\%good,  "Top Links");
    DisplayTop(\%bad,   "Broken Links");
    DisplayTop(\%redir, "Redirected");
    DisplayTop(\%error, "Other Errors");
    return;
}

# parse_options(\@argv)
#
# Removes leading option arguments from @argv (stopping at "--", at a lone
# "-", or at the first argument that does not start with "-") and returns
# a hash reference with the keys:
#   all, errors, redirects  - boolean flags
#   status                  - numeric status code, or undef
#   urls, sites, refurls, refsites - array refs of compiled patterns
#   filtered                - true when any of the four pattern lists is non-empty
# Unknown options are reported on STDERR and otherwise ignored.
sub parse_options {
    my ($argv) = @_;
    my %opt = (
        all      => 0,
        errors   => 0,
        redirects => 0,
        status   => undef,
        urls     => [],
        sites    => [],
        refurls  => [],
        refsites => [],
    );

    # /^-./ leaves a lone "-" in place so that <> treats it as STDIN.
    while (@$argv && $argv->[0] =~ /^-./) {
        my $arg = shift @$argv;
        last if $arg eq '--';

        if ($arg =~ /^-status=(\d+)$/) {
            $opt{status} = $1;
        }
        elsif ($arg =~ /^-(refsite|refurl|site|url)=(.*)/) {
            my ($kind, $value) = ($1, $2);
            # *url options are exact matches, *site options are prefix matches.
            my $tail = $kind =~ /url$/ ? '$' : '';
            push @{ $opt{ $kind . 's' } }, _compile_pattern($arg, "^$value$tail");
        }
        elsif ($arg =~ /^-errors$/) {
            $opt{errors} = 1;
        }
        elsif ($arg =~ /^-redir/) {       # -redir, -redirect, -redirects
            $opt{redirects} = 1;
        }
        elsif ($arg =~ /^-a(?:ll)?$/) {
            $opt{all} = 1;
        }
        else {
            warn "linkstats: ignoring unknown option '$arg'\n";
        }
    }

    $opt{filtered} = scalar grep { @{ $opt{$_} } } qw(urls sites refurls refsites);
    return \%opt;
}

# _compile_pattern($option, $pattern)
#
# Compiles $pattern once, up front, and dies with a message naming the
# offending option if it is not a valid regular expression.
sub _compile_pattern {
    my ($option, $pattern) = @_;
    my $re = eval { qr{$pattern} };
    die "linkstats: invalid pattern in '$option': $@" unless defined $re;
    return $re;
}

# parse_log_line($line)
#
# Extracts (request, status code, referrer) from one combined-format log
# line.  The second pattern handles entries whose request field is "-" or
# empty.  Returns an empty list when the line matches neither pattern.
sub parse_log_line {
    my ($line) = @_;
    if (   $line =~ m/^\S+ \S+ .*? \[\d\d\/[a-z]+\S+ \S+\] "\S+ (.*?) ?\S*" (\d+) .*? "(.*?)"/i
        || $line =~ m/^\S+ \S+ .*? \[\d\d\/[a-z]+\S+ \S+\] "(-?)" (\d+) .*? "(.*?)"/i)
    {
        return ($1, $2, $3);
    }
    return;
}

# matches_any($text, \@patterns)
#
# Returns 1 if $text matches at least one of the compiled patterns, else 0.
sub matches_any {
    my ($text, $patterns) = @_;
    for my $re (@$patterns) {
        return 1 if $text =~ $re;
    }
    return 0;
}

# DisplayTop(\%count_of, $heading)
#
# Prints the entries of %count_of, most frequent first (ties in key order),
# after dropping those seen fewer than $mincount times or accounting for
# less than $minpercent percent of the category total.  Prints nothing when
# the hash is empty or no entry survives.  The hash itself is not modified.
sub DisplayTop {
    my ($count_of, $heading) = @_;
    return unless %$count_of;

    my $n     = keys %$count_of;
    my $total = 0;
    $total += $_ for values %$count_of;
    my $threshold = $total * ($minpercent / 100);

    my @shown = grep {
        $count_of->{$_} >= $mincount && $count_of->{$_} >= $threshold
    } keys %$count_of;
    my $limit = @shown;
    return unless $limit;

    print "\n$heading (displaying top $limit of $n lines):\n";
    for my $msg (sort { $count_of->{$b} <=> $count_of->{$a} || $a cmp $b } @shown) {
        printf("%5d %s\n", $count_of->{$msg}, $msg);
    }
    return;
}