#!/usr/local/bin/perl
#
# new.pl - create an index of new or changed web files
#
# Usage: new.pl directory "title" [file.html]
#
# Finds any new or changed files in the indicated directory
# and lists them in file.html (written under $ChangeDir).  If the
# filename is omitted, the last component of the directory path
# will be used.
#
# Run every night between midnight and 1 am to merge the previous
# day's changes into the index.  Keeps the last 7 days of changes.
#
# Notes: - Files with 'test' in their path or title are not indexed
#          (so test documents don't show up in the "what's new" listing).
#        - Files with .htaccess in any parent directory are not indexed
#          (since most people probably can't access them anyway).
#        - Also skips anything disallowed in the robots.txt file,
#          anything in the @ignore array, and anything specified
#          with -ignore= on the command line.
#
# Earl Fogel, April 1996

# things you should configure
$DocumentRoot = "/usr/local/etc/httpd/htdocs";   # filesystem root of the web tree
$ServerURL = "http://www.usask.ca";              # URL prefix that replaces $DocumentRoot
$ChangeDir = "$DocumentRoot/changes";            # directory the index files are written to

# you shouldn't need to change anything below this line

# Here are some things to ignore.
# These can either be filenames (in which case they are ignored
# in every directory in which they occur) or paths (containing a slash)
# (in which case every file beginning with this path is ignored).
# The generated indexes themselves live in $ChangeDir, so it is always ignored.
@ignore = ($ChangeDir);

# use -ignore=xxx to skip files named 'xxx'.
# Consume leading option arguments: "--" ends option processing and
# "-ignore=xxx" appends xxx to @ignore.  Any other "-option" is dropped.
# Checking @ARGV first avoids matching against an undefined $ARGV[0].
while (@ARGV && $ARGV[0] =~ /^-/) {
    $_ = shift(@ARGV);
    last if /^--$/;
    /^-ignore=(.*)/ && push(@ignore, "$1");
}

# Two arguments: the output file is named after the last component of the
# directory.  Three arguments: the output file name is given explicitly.
# Either way the file is placed in $ChangeDir.
if ($#ARGV == 1) {
    $directory = $ARGV[0];
    $title = $ARGV[1];
    # No leading slash is required, so a relative name such as "docs" works;
    # trailing slashes are tolerated.  If no component can be found, stop
    # instead of building "$ChangeDir/.html" from an empty $1.
    if ($directory =~ m{([^/]+)/*$}) {
        $outfile = "$ChangeDir/$1.html";
    } else {
        print STDERR "new.pl: can't derive an output file name from '$directory'\n";
        exit(-1);
    }
} elsif ($#ARGV == 2) {
    $directory = $ARGV[0];
    $title = $ARGV[1];
    $outfile = "$ChangeDir/$ARGV[2]";
} else {
    print STDERR "Usage: new.pl directory title [file.html]\n";
    exit(-1);
}

# Set until the first indexable file is found; the main loop tests it with
# defined() and undefines it once the output file has been started.
$firsttime=1;

@months = ( "January", "February", "March", "April", "May", "June", "July",
            "August", "September", "October", "November", "December");
@days = ( "Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday");

# The script is meant to run shortly after midnight, so step back one hour
# to label the section with the day whose changes are being reported.
($sec,$min,$hour,$mday,$mon,$year,$wday) = localtime(time-3600);
# localtime() returns the year as an offset from 1900; prefixing "19" would
# print "19100" in the year 2000.  $year itself is left untouched.
$fullyear = $year + 1900;
$date = "$days[$wday] $months[$mon] $mday, $fullyear";

# List the .html files modified within the last day.  The directory is
# single-quoted for the shell so spaces or metacharacters in it cannot
# alter the command.
($shelldir = $directory) =~ s/'/'\\''/g;
open(FNAMES, "find '$shelldir' -name '*.html' -mtime -1 -print |")
    || die("can't run find on $directory: $!\n");

# ignore anything disallowed in robots.txt
if (open(ROBOTS,"$DocumentRoot/robots.txt")) {
    while (<ROBOTS>) {
        # Field names are case-insensitive and the space after the colon is
        # optional.  An empty "Disallow:" means nothing is disallowed, so it
        # must not add $DocumentRoot itself (which would hide every file).
        if (/^\s*Disallow\s*:\s*([^\s#]+)/i) {
            push(@ignore, "$DocumentRoot$1");
        }
    }
    close(ROBOTS);
}

# NOTE(review): the main "newfile:" loop starts on this source line and
# continues past it.  Its opening part is reproduced verbatim below and left
# in comment form (it already followed a '#' on the collapsed source line),
# because the remainder of the loop is not intact in this part of the file.
# It must be restored together with the rest of the loop body.
# newfile: while (<FNAMES>) { chop; next if $_ eq "$outfile"; next if /test/; foreach $dir (@ignore) { next newfile if $_ =~ "^$dir"; next newfile if ($_ =~ "/$dir\$" && $dir !~ /\//); } # skip it if any ancestor directory has a .htaccess file $dirtest = $_; while ($dirtest =~ "/") { $dirtest =~ s#/[^/]*$##; next newfile if -e "$dirtest/.htaccess"; } $fname = $_; ($url = $fname) =~ s/$DocumentRoot/$ServerURL/e; $desc = $url; # use the URL if we can't find a title below open(FILE,"<$fname") || print STDERR "can't read $fname\n"; $size=read(FILE,$_,32768,0); if ($size>20 && /<title>/i) { s/\n/ /g; s/.*<title>//i; s/<\/title>.*//i; s/<[^>]*>//g; s/^ *//; s/ *$//; $desc = $_ if /\w/; # has at least one alpha-numeric character } close(FILE); next if ($desc =~ /\btest\b/i); # skip test pages if (defined($firsttime)) { undef($firsttime); rename($outfile, $outfile . 
'~'); open(INDEX, ">$outfile") || die("can't write $outfile\n"); print INDEX <<"EOF"; <HTML> <HEAD> <TITLE>$title new and changed files

$title new and changed files

$date

EOF if (open(OLDINDEX, "$outfile~")) { # skip file header while() { last if (/^

/); } $count++; print INDEX $_; # copy the first 6 sections while() { $count++ if (/^

/); last if ($count>=6 || /^<.BODY>/); print INDEX $_; } } close(OLDINDEX); print INDEX <<"EOF"; EOF