#!/usr/local/bin/perl
#
# new.pl - create an index of new or changed web files
#
# Usage: new.pl directory "title" [file.html]
#
# Finds any new or changed files in the indicated directory
# and lists them in file.html. If the filename is omitted,
# the last component of the directory path will be used.
#
# Run every night between midnight and 1 am to merge the previous
# day's changes into the index. Keeps the last 7 days of changes.
#
# Notes: - Files with 'test' in their path or <title>
# are not indexed
# (so test documents don't show up in the "what's new" listing).
# - Files with .htaccess in any parent directory are not indexed
# (since most people probably can't access them anyway).
# - Also skips anything disallowed in robots.txt file,
# anything in the @ignore array, and anything specified
# with -ignore= on the commandline.
#
# Earl Fogel, April 1996
# Site configuration -- adjust these three values for your server.
# $DocumentRoot: filesystem directory the web server serves documents from
# $ServerURL:    URL prefix that corresponds to $DocumentRoot
# $ChangeDir:    directory (under $DocumentRoot) holding the generated indexes
$DocumentRoot = '/usr/local/etc/httpd/htdocs';
$ServerURL    = 'http://www.usask.ca';
$ChangeDir    = $DocumentRoot . '/changes';
# you shouldn't need to change anything below this line
# Here are some things to ignore.
# These can either be filenames (in which case they are ignored
# in every directory in which they occur) or paths (containing a slash)
# (in which case every file beginning with this path is ignored).
# Start the ignore list with the directory holding the generated indexes.
@ignore = ($ChangeDir);
# Leading options: use -ignore=xxx to skip files named 'xxx' (or, if xxx
# contains a slash, every file whose path begins with xxx).  A bare "--"
# ends option processing.  Any other option is silently discarded.
while (defined($ARGV[0]) && $ARGV[0] =~ /^-/) {
    $_ = shift(@ARGV);
    last if /^--$/;
    # Require a non-empty value: an empty ignore entry would be a prefix
    # of every path and would cause every file to be skipped.
    if (/^-ignore=(.+)/) {
        push(@ignore, $1);
    }
}
# Positional arguments: directory, title and an optional output file name.
# Sets $directory, $title and $outfile (always inside $ChangeDir).  When the
# file name is omitted it is derived from the last component of the
# directory path.  Prints a message to STDERR and exits on bad usage.
if (@ARGV == 2 || @ARGV == 3) {
    ($directory, $title) = @ARGV;
    if (@ARGV == 3) {
        $outfile = "$ChangeDir/$ARGV[2]";
    } elsif ($directory =~ m#([^/]+)/*$#) {
        # Last path component, with or without leading directories and
        # trailing slashes.  $1 is only used when the match succeeded, so
        # a capture left over from an earlier match can never leak in.
        $outfile = "$ChangeDir/$1.html";
    } else {
        print STDERR "new.pl: can't derive an output file name from '$directory'\n";
        exit(-1);
    }
} else {
    print STDERR "Usage: new.pl directory title [file.html]\n";
    exit(-1);
}
# $firsttime stays defined until the first index entry is about to be
# written; it is what tells the rest of the script whether anything new
# was found.
$firsttime = 1;
@months = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December");
@days = (
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday");
# The script is meant to run between midnight and 1 am, so stepping back
# one hour yields the date of the day whose changes are being indexed.
($sec, $min, $hour, $mday, $mon, $year, $wday) = localtime(time - 3600);
# localtime returns the year as an offset from 1900, so it must be added,
# not prefixed with "19" (which prints e.g. "19126" for 2026).
$fullyear = $year + 1900;
$date = "$days[$wday] $months[$mon] $mday, $fullyear";
# Ask find(1) for every *.html file under $directory that matches
# -mtime -1, and read its output through the FNAMES handle.
# The directory is wrapped in single quotes for the shell (embedded single
# quotes are escaped) so names containing spaces or shell metacharacters
# are passed to find as one literal argument.
($qdir = $directory) =~ s/'/'\\''/g;
open(FNAMES, "find '$qdir' -name '*.html' -mtime -1 -print |")
    || die("can't run find on $directory: $!\n");
# Ignore anything disallowed in robots.txt: every Disallow path, from any
# User-agent group, is added to @ignore as a filesystem path prefix.
# A missing or unreadable robots.txt is not an error.
if (open(ROBOTS, "$DocumentRoot/robots.txt")) {
    while (<ROBOTS>) {
        s/#.*//;        # drop robots.txt comments
        # An empty "Disallow:" means "allow everything"; it must not add
        # $DocumentRoot itself to the list (that would ignore every file).
        # The space after the colon is optional.
        next unless /^\s*Disallow:\s*(\S+)/i;
        push(@ignore, "$DocumentRoot$1");
    }
    close(ROBOTS);
}
# Main pass.  Reads the pathnames reported by find on FNAMES, drops the
# ones that must not be indexed, and writes the rest as a new dated section
# at the top of $outfile.  The previous index is renamed to "$outfile~" and
# its most recent sections are appended after the new one.
#
# Layout written (and parsed back on the next run):
#   <H2>date</H2> starts each day's section, entries are <LI> items in a
#   <UL>, and a line starting with </BODY> ends the sections.
# NOTE(review): an existing index that uses different markup for its
# section headings will not have its old sections carried over -- confirm
# against the index files currently on disk.
$keepold = 6;   # older sections to keep; with the new one that is 7 days
NEWFILE:
while (<FNAMES>) {
    chop;       # find ends every pathname with a newline
    $fname = $_;
    next NEWFILE if $fname eq $outfile;
    next NEWFILE if $fname =~ /test/;
    foreach $dir (@ignore) {
        next if $dir eq '';     # an empty entry would match every path
        # Any entry: skip paths that begin with it.  Compared as a literal
        # string so '.', '+' etc. in a path are not regex operators.
        next NEWFILE if index($fname, $dir) == 0;
        # Entry without a slash: also skip files with that name in any
        # directory.
        if (index($dir, '/') < 0) {
            $tail = "/$dir";
            next NEWFILE
                if length($fname) >= length($tail)
                && substr($fname, length($fname) - length($tail)) eq $tail;
        }
    }
    # skip it if any ancestor directory has a .htaccess file
    $dirtest = $fname;
    while ($dirtest =~ m#/#) {
        $dirtest =~ s#/[^/]*$##;
        next NEWFILE if -e "$dirtest/.htaccess";
    }
    # Map the filesystem path to its URL by swapping the leading
    # $DocumentRoot for $ServerURL (literal prefix, not a regex).
    if (index($fname, $DocumentRoot) == 0) {
        $url = $ServerURL . substr($fname, length($DocumentRoot));
    } else {
        $url = $fname;
    }
    $desc = $url;   # use the URL if we can't find a title below
    if (open(FILE, "<$fname")) {
        # The title is looked for in the first 32K of the file only.
        $size = read(FILE, $head, 32768);
        close(FILE);
        if ($size > 20 && $head =~ /<title>/i) {
            $head =~ s/\n/ /g;          # let '.' span the original lines
            $head =~ s/.*<title>//i;
            $head =~ s/<\/title>.*//i;
            $head =~ s/<[^>]*>//g;      # drop any markup inside the title
            $head =~ s/^\s+//;
            $head =~ s/\s+$//;
            $desc = $head if $head =~ /\w/;   # at least one word character
        }
    } else {
        # Best effort: an unreadable file is still listed, by URL.
        print STDERR "can't read $fname\n";
    }
    next NEWFILE if $desc =~ /\btest\b/i;   # skip test pages
    if (defined($firsttime)) {
        # First entry to be listed: set the old index aside and start the
        # new one.  rename fails harmlessly when there is no old index.
        undef($firsttime);
        rename($outfile, $outfile . '~');
        open(INDEX, ">$outfile") || die("can't write $outfile\n");
        print INDEX <<"EOF";
<HTML>
<HEAD>
<TITLE>$title new and changed files</TITLE>
</HEAD>
<BODY>
<H1>$title new and changed files</H1>
<H2>$date</H2>
<UL>
EOF
    }
    print INDEX "<LI><A HREF=\"$url\">$desc</A>\n";
}
close(FNAMES);
if (defined($firsttime)) {  # nothing happened; leave the old index alone
    exit;
}
print INDEX "</UL>\n";
if (open(OLDINDEX, "$outfile~")) {
    $count = 0;     # section headings seen so far in the old index
    $copying = 0;   # false while still inside the old file header
    while (<OLDINDEX>) {
        last if /^<.BODY>/;     # closing </BODY>: end of the sections
        if (/^<H2>/i) {
            # Stop at the heading after the last section to be kept.
            last if ++$count > $keepold;
            $copying = 1;
        }
        print INDEX $_ if $copying;
    }
    close(OLDINDEX);
}
print INDEX "</BODY>\n</HTML>\n";
# Buffered write errors only show up at close time.
close(INDEX) || die("can't write $outfile\n");