#!/usr/bin/perl # Note: requires Perl5 $version = "0.46"; # ============================================================== # W E B X R E F # ============================================================== # Webxref is a WWW link checker and cross referencing tool, intended # to quickly check a local set of HTML documents for missing files, # anchors etc. You simply call webxref with a HTML document as the # parameter, without any configuring. # Webxref compiles a list of HTML documents, URLs, name anchors, # images etc and the html files that reference those, i.e. a # cross-reference list. Hence the name: webxref. # # Webxref was originally written by Rick Jansen at SARA in 1995 & 1997 # email: rick@sara.nl # url: http://www.sara.nl/Rick.Jansen # # Since then, it has been substantially overhauled by: # # Earl Fogel # http://www.usask.ca/~fogel/ # # Feb 1998 - change -htmlonly to -forcehtml # - only include html files in fluff report # (unless using -forcehtml) # - can have both name= and href= # - read files directly into large string # - bug fix: only check external URLs once # Dec 1998 - added -user, -group options # - added -up option, fixes to ../ # Feb 1999 - fixed -depth, and http # - fixed fluff file & directory bugs # - now handles symbolic links to directories # - use \r\n\r\n to end HTTP requests # - track https urls # - minor formatting improvements # - less output buffering # - added -debug # - use http to retrieve files # (so web server can process SSIs) # Nov 1999 - fixed port # handling in MakeLocal, -islocal # - try HTTP HEAD request if file or directory does not exist # - add timeout for http requests # - deleted some options # - cleaned up help and usage text # - fix printing of urls in html output # Jan 2000 - fix Y2K bug in PrintTimeStamp # Feb 2000 - follow links to .shtml files # Jul 2000 - allow single-quoted values in src=,href= # Mar 2003 - report on files avoided (wrong user, group, ...) 
# Sep 2004 - report user/group names instead of ids # =================================================================== # DISCLAIMER: This software is provided *AS IS*, with no warranty # of any kind. # =================================================================== #--------------------------------------------- # Configuration: # Files to try in case of a directory reference like ../ @DefaultFiles = ('index.html','index.htm','index.shtml','index.php'); @Extensions = ('html','htm','shtml','php'); $MyHost = "www.usask.ca"; $MyPort = 80; $ServerRoot = "/var/www/"; #$SiteRoot = "/cwis/www"; #--------------------------------------------- # Things you shouldn't need to change $BaseURL = "http://$MyHost"; $BaseURL .= ":$MyPort" if $MyPort != 80; $MaxDots = 50; # number of +/-'s per line max $WebxrefReferer = '--webxref--'; $WebxrefAgent = "WebXRef/$version"; $| = 1; # unbuffered output for progress indicator $debug = 0; # Bugs? haha! hm. &GetParams; &InitStatusMessages; # Hello print < Webxref output

EOM print <
EOM # If interrupted print output so far # NOTE: This is unreliable if webxref was interrupted # asynchronously. The C-library is not re-entrant, so # if printing was in progress printing may well fail # due to malloc running into trouble. Oh well. It does # work sometimes. $SIG{INT} = 'InterruptHandler' if (! $NoInterrupt); # If no root is supplied as a parameter, use current directory $SiteRoot = &GetCWD unless defined $SiteRoot; $SiteRoot .= '/' unless $SiteRoot =~ m#/$#; chdir($SiteRoot) || die "Cannot cd to $SiteRoot: $!\n"; print "
\n" if ($HTMLReport);

# Check every start file named on the command line.
#
# For each argument: build its full path under $SiteRoot, make sure the
# file exists, optionally pick up its owner/group (for -user/-group),
# derive the depth limit from the depth of the file's own directory, and
# then follow its links recursively via GetReferences.

# Depth limit as it stood before any start file was processed.  Every
# start file gets its own limit derived from this value, so the limit no
# longer grows with each additional file on the command line.
my $BaseMaxDepth = $MaxDepth;

# A named loop variable is used instead of $_ because the subs called
# below (GetReferences -> GetLinks) assign to $_, which in a bare
# "foreach (@ARGV)" is an alias for the @ARGV element itself.
foreach my $StartFile (@ARGV) {
  # Resolve the argument first, so that every check below looks at the
  # file of *this* iteration and not at the one handled previously.
  $InFile = $SiteRoot . $StartFile;

  # Does the input file exist at all?
  if (! -e $InFile) {
    &AddedToList(*LostFileList,$InFile,$WebxrefReferer);
    print "Cannot find file $InFile\n";
    next;
  }

  # With -user/-group the flag is replaced by the uid/gid of the start
  # file; GetReferences compares the owner of every file against it.
  # NOTE(review): a start file owned by uid/gid 0 leaves the flag false
  # and so switches the restriction off -- confirm whether that matters.
  if ($User || $Group) {
    ($uid,$gid) = (stat($InFile))[4,5];
    $User = $uid if $User;
    $Group = $gid if $Group;
  }

  print "\nChecking $InFile\n\n" if $debug;
  ($TopDir,$f,$a,$RootDepth) = SplitFile($InFile);

  # GetReferences compares the depth SplitFile reports for each file
  # against $MaxDepth, so the limit is offset by this start file's depth.
  $MaxDepth = $BaseMaxDepth + $RootDepth - 1;
  print "Maxdepth=$MaxDepth\n" if $debug;
  print "Local Host and port: $MyHost $MyPort\n" if $debug;
  &GetFluffFiles($TopDir) if ($Fluff);

  &GetReferences($InFile,$WebxrefReferer,$SiteRoot);
}

# All start files have been processed: if -fluff was given ($Fluff set),
# see if there are any never-referenced files.
&PickFluff if ($Fluff);

print "
\n" if ($HTMLReport); &PrintLists; $PrintedLists = 1; # Check external URLs if (($Do_External_URLs) && (%HTTPList)) { $DotCount = 0; print "\n\nExternal HTTP checking starts\n" if ($Dots); print "

\n" if $Dots && $HTMLReport; if (!$Silent) { print <<"E_O_T" if !$HTMLReport; - - - - - - - - - - - - - - - - - - - - - - - - - - - Going to really check external URLs via the network. This may take some time. Simply abort webxref if you are out of patience. - - - - - - - - - - - - - - - - - - - - - - - - - - - E_O_T print <<"E_O_T" if $HTMLReport;

Going to really check external URLs via the network. This may take some time. Simply abort webxref if you are out of patience.

E_O_T } &Check_External_URLs(*HTTPList, "Checking external URLs:"); print "\nAll done.\n" if (!$Silent); } print < This version ($version) by Earl Fogel EOM print < Webxref was originally written by Rick Jansen (rick\@sara.nl)
This version ($version) by
Earl Fogel EOM exit; # End of webxref #--------------------------------------------- sub InterruptHandler { # Called on interrupt # Print output accumulated so far and exit $| = 0; print "\n\n** Interrupt!\n"; #$interrupted++; &PrintLists unless $PrintedLists; exit; } #--------------------------------------------- sub GetParams { # Defaults $Do_External_URLs = 1; # Default we do check external URLs $HTML_only = 1; # If 0, referenced files are checked for links # even if the file has no .html extension $Avoid = ""; # Regexp to avoid certain URLs, files,... $Silent = 1; # If silent=1 only error msgs will be printed $Verbose = 0; # Default as little output as possible $Dots = 1; # Print a + for every file checked (- if failed) $Errors = 0; # Print error messages only $Xref = 1; # Generate cross references $MaxDepth = 100; # Default max nesting level $Fluff = 0; # Do not check for unreferenced files $LongReport = 0; # Default is no reports $FullPath = 0; # If true print full filepaths $HTTPDelay = 1; # default 1 second between external link checks $HTTPTimeout = 5; # number of seconds to wait for http requests $Time = '000000'; $NoInterrupt = 0; # Default webxref is interruptable $InFile = ""; while ($ARGV[0] =~ /^-/) { if ($ARGV[0] eq "-help") { &PrintHelp; } if ($ARGV[0] eq "-h") { &PrintHelp; } elsif ($ARGV[0] eq "-noxref") { $Xref = 0; } elsif ($ARGV[0] eq "-xref") { $Xref = 1; } elsif ($ARGV[0] eq "-onexref") { $OneXref = 1; } elsif ($ARGV[0] eq "-x") { $Xref = 1; } elsif ($ARGV[0] eq "-forcehtml") { $HTML_only = 0; } elsif ($ARGV[0] eq "-nohttp") { $Do_External_URLs = 0; } elsif ($ARGV[0] eq "-delay") { $HTTPDelay = &NextArg; } elsif ($ARGV[0] eq "-timeout") { $HTTPTimeout = &NextArg; } elsif ($ARGV[0] eq "-nofluff") { $Fluff = 0; } elsif ($ARGV[0] eq "-fluff") { $Fluff = 1; } elsif ($ARGV[0] eq "-dots") { $Dots = 0; } elsif ($ARGV[0] eq "-silent") { $Silent = 1; $Dots = 0; } elsif ($ARGV[0] eq "-s") { $Silent = 1; $Dots = 0; } elsif ($ARGV[0] eq "-verbose") { 
$Verbose++; $Silent = 0; $Dots = 0; $Errors = 1; } elsif ($ARGV[0] eq "-noint") { $NoInterrupt = 1; } elsif ($ARGV[0] eq "-v") { $Verbose++; $Silent = 0; $Dots = 0; $Errors = 1; } elsif ($ARGV[0] eq "-errors") { $Errors = 1; } elsif ($ARGV[0] eq "-error") { $Errors = 1; } elsif ($ARGV[0] eq "-e") { $Errors = 1; } elsif ($ARGV[0] eq "-long") { $LongReport = 1; } elsif ($ARGV[0] eq "-html") { $HTMLReport = 1; } elsif ($ARGV[0] eq "-debug") { $debug = 1; } elsif ($ARGV[0] eq "-islocal") { $IsLocal = &NextArg; } elsif ($ARGV[0] eq "-avoid") { $Avoid = &NextArg; } elsif ($ARGV[0] eq "-a") { $Avoid = &NextArg; } elsif ($ARGV[0] eq "-root") { $SiteRoot = &NextArg; } elsif ($ARGV[0] eq "-r") { $SiteRoot = &NextArg; } elsif ($ARGV[0] eq "-fullpath") { $FullPath = 1; } elsif ($ARGV[0] eq "-depth") { $MaxDepth = &NextArg; } elsif ($ARGV[0] eq "-date") { $Date = &NextArg; } elsif ($ARGV[0] eq "-time") { $Time = &NextArg; } elsif ($ARGV[0] eq "-before") { $Before = 1; } elsif ($ARGV[0] eq "-after") { $After = 1; } elsif ($ARGV[0] eq "-user") { $User = 1; } elsif ($ARGV[0] eq "-group") { $Group = 1; } elsif ($ARGV[0] eq "-up") { $Up = 1; } elsif ($ARGV[0] eq "-intermediair") { $Intermediair = 1; } # Special case else {&PrintUsage; exit;} shift @ARGV; } $InFile = $ARGV[0]; if ($InFile eq "") { &PrintUsage; exit; } if ($After || $Before) { die "You cannot specify both -before AND -after\n" if ($Before && $After); die "-time format must be hhmmss or hhmm" unless ($Time =~ /\d\d\d\d(\d\d)?/); die "-date format must be yymmdd or yymm" unless ($Date =~ /\d\d\d\d(\d\d)?/); $Before = 0 if ($After); # We don't use $After $TimeStamp = &ConvertTimeStamp($Date,$Time); } if ($IsLocal) { $IsLocal =~ s#^http://##i; $IsLocal =~ s#/.*##; if ($IsLocal =~ m#^([-\w\.]+):(\d+)$#i) { $MyHost = $1; $MyPort = $2; } elsif ($IsLocal =~ m#^([-\w\.]+)$#i) { $MyHost = $1; } elsif ($IsLocal =~ m#^[-\w\.]+$#i) { $MyHost = $IsLocal; } } } # GetParams sub NextArg { shift @ARGV; return $ARGV[0]; } # NextArg sub 
PrintUsage { print <<"E_O_T"; Usage: webxref file ... Webxref has many options. Use -help to see a full list. The most common options are: What to Check: -fluff Check for files/directories which are never used. -depth Limit recursive checking of other files: 0 means current file only, 1 means current directory, 2 adds sub-directories, 3 adds sub-sub-directories, and so on. -nohttp Do not check external URLs via the network. What to Report: -dots Don't show +- progress indicators. -verbose/-v Show more detailed progress indicators. -long List all files found, not just problems. -noxref Do not list which files reference files. -html Print report in html format, links and all. E_O_T } sub PrintHelp { print <<"E_O_T"; ========================================= Which parameters to use for what purpose: ========================================= Default webxref checks the given file and follows the links in that file. While working it lets you know it's alive by printing a '+' for each file checked ok, and a '-' for each file with a problem. A webxref run can take some time. You can, however, interrupt webxref with ctrl-c (Unix). Webxref will report on the files it has inspected up to that moment and exit. (*New!*) (Note: this is not reliable! webxref is not interruptable at any time, due to the C-libraries not being re-entrant. (This probably does not interest you at all, but it's not the author's fault.)) Specify -noint if you don't want webxref to try and generate output after an interrupt. When the whole site has been searched and all links have been inspected webxref prints a report. Default only problems are reported. Specify -long to obtain a long report. Specify -html to get a report in HTML form. If you want more information while webxref is working specify -verbose to get messages on every file or -errors to see only files with problems. With -silent webxref prints nothing at all while working. 
Webxref keeps track of which html-documents are being linked to from other documents. This is called cross-referencing, hence webxref's name. If you are not interested in this, specify -noxref, so you won't be told where things have failed and probably have to run webxref again. If you're just interested in one location where a file is referenced specify -onexref. This saves memory too. If you need to know if there are files and/or directories in your site that are not referenced at all by any pages in your site specify -fluff. If you want to check for links inside files that do not have the .html or .htm extension, specify -forcehtml Use -root to tell Webxref the location of your web server's document root. File names are abbreviated, that is /u/people/rick/www/a.html is printed as "a.html" if webxref is called from ~/rick/www. If you specify -fullpath you'll get the full paths. If you use full URLs in your site referring to your own site, say "www.sara.nl" is your www-address and you use links like then tell webxref that "www.sara.nl" actually can be found on the local machine with: -islocal 'www.sara.nl' If you want to avoid certain files use the -avoid parameter to specify which files to avoid. If you want to limit the number of files webxref inspects you may want to limit the scan to 1 or 2 directories deep in the file system. If you specify -depth 1 only files in the current directory are inspected. If you just want to check if links in a file are valid specify -depth 0. Only the links present in the file are tested, but no more. After all local files are inspected webxref goes out into the net to check if the http:// links work. This may be time-consuming. Specify -nohttp if you don't want that. To avoid overloading a webserver there is a delay of 1 second between checks. If you want longer or shorter delays specify the number of seconds with -delay. (Longer delays may be necessary if a lot of links refer to the same webserver.) 
To see if you have files or directories that were modified last before or after a certain date/time use: -before/-after -date yymmdd -time hhmmss. If -before is given files are reported that were modified before the date given, with -after files last modified after the date given are reported. Use -user or -group to only inspect files with the same owner or group as the original. Use -up if you want to inspect files in parents of the original directory. To tell webxref which files to inspect simply list the file or files at the end of the command. ======================= What the parameters do: ======================= While checking webxref prints output according to: -dots Don't show +- progress indicators. -silent/-s Only list files with problems at the end of the run. -verbose/-v Print information while checking files. -errors/-e Print errors when they occur, even when -silent. -noint Do not generate output on interrupt Webxref generates a report according to: -long List all files found, not just problems. -xref/-x List which files reference files (default). -noxref Do not list which files reference files (cross-references). -html Print report in html form, links made active and all. -fullpath Print full-length filenames, e.g. /u/people/rick/www.html Webxref inspects files/directories according to: -fluff List which files/directories are never used. -forcehtml Inspect files without the .html/.htm extension. -root rootdir Your web server's document root, i.e. the directory corresponding to . -islocal url 'www.mymachine.nl' is actually a local file reference. -avoid regexp Avoid files with names matching regexp for inspection. -depth number The maximum directory nesting level. 0 means: current file only, 1 means: current directory, 2 means: curent directory and 1 level of sub-directory, 100 probably means there is no restriction in how deep webxref is allowed to find files. -nohttp Do not check external URLs via the network. 
-delay seconds Seconds to wait between HTTP requests (default $HTTPDelay) -timeout secs Maximum seconds to wait for HTTP requests (default $HTTPTimeout) -date -time Date [yymm

], time [hhmm]. -before -after List files that are modified before or after the date/time given with -date and -time. ======== Examples ======== webxref file.html Checks file.html and files/URLs referenced from file.html Only lists problems at the end of the run, + and - for each file checked. webxref index.html another.html checks index.html and another.html webxref -depth 0 index.html just check the links in index.html, don't follow the links webxref -depth 1 index.html Check index.html and other files in the same directory webxref -nohttp file.html Checks file.html, but not external URLs webxref -forcehtml file.txt Checks file.txt, even though it doesn't end in .html/htm webxref -avoid '.*Archive.*' file.html Checks file.html but avoids files with names containing 'Archive' webxref -islocal www.sara.nl Treat things like '= $MaxDots) { print "\n"; $DotCount = 0; } } # PrintDot sub GetReferences { # -- GetReferences($link,$referer,cwd) # Get all references from the link(file) and check those recursively. # Link can be a file, or a ref in the form http:// etc # Note: the files referenced are kept as full filesystem paths # to those files. 
This is done to ensure that references to # the file /u/user/file.html is the same as a reference "../file.html" local($link,$referer,$cwd) = @_; #print "\nGetRefs:\n $link\n Referer: $referer\n" if $debug; # These vars are pushed onto the stack each recursive call local($dir,$file,$anchor); local($Old_Dir); local($filename); local($upref); ($method,$rest) = SplitURL($link); #print "GetReferences link: $link method: $method\n" if $debug; if ($method eq 'http') { return unless &AddedToList(*HTTPList, $link, $referer); } elsif ($method eq 'https') { return unless &AddedToList(*HTTPSList, $link, $referer); } elsif ($method eq 'ftp') { return unless &AddedToList(*FTPList, $link, $referer); } elsif ($method eq 'telnet') { return unless &AddedToList(*TelnetList, $link, $referer); } elsif ($method eq 'gopher') { return unless &AddedToList(*GopherList, $link, $referer); } elsif ($method eq 'mailto') { return unless &AddedToList(*MailList, $link, $referer); } elsif ($method eq 'news') { return unless &AddedToList(*NewsList, $link, $referer); } elsif ($method eq 'CGI') { ; } elsif ($method eq 'file') { ; } else { print "Unknown method \"$method\": $link\n"; } # If CGI just try to check if the script is present if ($method eq 'CGI') { # Delete parameters of cgi script $link =~ s/\?.*//; if (-e "$ServerRoot$link" || -e "${ServerRoot}share/$link") { return unless &AddedToList(*CGIList, $link, $referer); } else { return unless &AddedToList(*LostCGIList, $link, $referer); } return; } # cgi return if ($method ne 'file'); # Apparently what we have ended up with at this point # is a reference to a file of some sort. This "file" # can also still be a directory. It can also be a name # anchor in the file. 
($dir,$file,$anchor,$depth) = &SplitFile($link); #print "split: $dir $file $anchor $depth\n" if $debug; # Apply the regexp to avoid certain files if ($Avoid) { #print "Avoid: $Avoid File: $file Dir: ",&PrintFile($dir),"\n"; if (&PrintFile($file) =~ m/$Avoid/o) { print " Avoided file ",&PrintFile($file),"\n" if (!$Silent) || ($Errors); return; } if (&PrintFile($dir) =~ m/$Avoid/o) { print " Avoided directory ",&PrintFile($dir),"\n" if (!$Silent) || ($Errors); return; } } if ($dir eq "") { $dir = $cwd; $dir .= '/' unless $dir =~ m#/$#; } # Only check links in files/directories below the starting directory # We have to test for these files here, but don't report on it til # later so we can check if they really exist and are readable, etc. if (!$Up && $dir !~ /^$TopDir/) { $upref = 1; } else { $upref = 0; } # Move to the specified directory to obtain the expanded # file path if (-d $dir) { $Old_Dir = $cwd; chdir($dir); $filename = $dir . $file; } else { my($fname) = $link; $fname =~ s#^$SiteRoot#/#i; $status = http_request($MyHost,$MyPort,'HEAD',$fname); if ($status > 0 && $status < 400) { # alias, redirect, scriptalias, ... &PrintDot('+') if ($Dots); print "HEAD $fname status $status $FailStatusMsgs{$status}\n" if $debug; &AddedToList(*FileList,$filename,$referer); return; } &PrintDot("-") if ($Dots); print "\n" if ($Errors && $Dots); print " ", &PrintFile($dir.$file)," cannot be found\n" if (!$Silent) || ($Errors); print " Referenced by: ",&PrintFile($referer),"\n" if (!$Silent) || ($Errors); &AddedToList(*LostFileList,$dir.$file,$referer); return; } # # Add to the list of already tested files if not inspected before # # If the "file" is a directory try Welcome/welcome/index.html etc. if (-d $filename) { return unless &AddedToList(*DirList,$filename,$referer); &DoDirectory; } # Not found? if (! -f $filename) { # try the default extensions: html, htm etc. 
$SecondChance = &TryExtensions($filename); if ($SecondChance) { # Apparently adding an extension did the trick $filename = $SecondChance; } else { my($fname) = $link; $fname =~ s#^$SiteRoot#/#i; $status = http_request($MyHost,$MyPort,'HEAD',$fname); if ($status > 0 && $status < 400) { # alias, redirect, scriptalias, ... &PrintDot('+') if ($Dots); print "HEAD $fname status $status $FailStatusMsgs{$status}\n" if $debug; &AddedToList(*FileList,$filename,$referer); return; } &PrintDot("-") if ($Dots); print "\n" if ($Errors && $Dots); print " ", &PrintFile($filename)," cannot be found\n" if (!$Silent) || ($Errors); print " Referenced by: ",&PrintFile($referer),"\n" if (!$Silent) || ($Errors); # Add to list of lost files &AddedToList(*LostFileList,$filename,$referer); return; } } # World readable? (do not use -r, doesn't work for root) ($dev,$ino,$mode,$nlink, $uid,$gid, $rdev,$size, $atime,$mtime,$ctime, $blksize,$blocks) = stat($filename); $readmode = ($mode & 4); if ($readmode == 0) { # Not world readable, add to list &AddedToList(*UnreadableList,$filename,$referer); } # Check if we need to list this file &CheckTimeStamp($filename,$mtime) if ($Before || $After); # Binary file? (pictures,...) if (-B $filename) { &AddedToList(*ImageFileList,$filename,$referer); return; } # Filename *must* have extension .html or .htm, else we don't inspect it. 
if ($HTML_only && $filename !~ /.*\.(s?html?|php)$/i) { &AddedToList(*OtherFileList,$filename,$referer); return; } # Apply the regexp to avoid certain files if ($Avoid && ($filename =~ m/$Avoid/)) { #print "** The above file is avoided.\n" if (!$Silent) || ($Errors); return; } # we don't recursively check links in these files if ($upref) { if (AddedToList(*AvoidFileList,$filename,$referer)) { print "** Skipping $filename (not in $TopDir).\n" if ($Verbose>1); } return; } # Only check files owned by this user/group if ($User && ($uid != $User)) { if (AddedToList(*AvoidFileList,$filename,$referer)) { my ($name) = getpwuid($uid); $uid = $name if $name; print "** Skipping $filename (user $uid).\n" if ($Verbose>1); } return; } if ($Group && ($gid != $Group)) { if (AddedToList(*AvoidFileList,$filename,$referer)) { my ($name) = getgrgid($gid); $gid = $name if $name; print "** Skipping $filename (group $gid)\n" if ($Verbose>1); } return; } # Hold it, we're getting too deep. ($dummy,$dummy,$dummy,$depth) = SplitFile($filename); if ($depth > $MaxDepth && $filename ne $InFile) { # print "Skip $filename, Depth ($depth) > MaxDepth ($MaxDepth)\n" # if ($depth > $MaxDepth && $debug); $TooDeepList{$filename} = 1; return; } # else it's a text (html)file # return if we already inspected it return unless &AddedToList(*FileList,$filename,$referer); # See which files are in this directory if we are checking # for unreferenced files, if not done before &GetFluffFiles($dir) if ($Fluff && (! $FluffScannedDirList{$dir})); # We now have a (html) file to check for further links, name anchors etc. 
print "Checking: ",&PrintFile($filename),"\n" if (!$Silent); &PrintDot('+') if ($Dots); local(@newlist) = GetLinks($filename); #print "done getlinks from ",&PrintFile($filename),":\n", join("\n",@newlist),"\n"; # Now see if the anchor we were after was found if ($anchor) { if (!defined($AnchorList{"$filename$anchor"})) { return unless &AddedToList(*LostAnchorList,"$filename$anchor",$referer); print " Anchor ",&PrintFile($anchor)," was NOT found in file ", &PrintFile($filename),"\n" if (!$Silent) || ($Errors); print "\n" if ($Errors && $Dots); &PrintDot('-') if ($Dots); } else { # Anchor found, add referer AddedToList(*AnchorList,"$filename$anchor",$referer); } return; } # Walk the list and check everything is there foreach $file (@newlist) { # Some http:// references are really local files $file = MakeLocal($file); ($method,$rest) = SplitURL($file); if ($method eq 'file') { if ($file =~ m#^/#) { # root reference $file =~ s#^/## if $SiteRoot =~ m#/$#; $Notlocal_file = "$SiteRoot$file"; } else { # Reference relative to directory $Notlocal_file = $dir . $file; } } else { # Not file but http, news etc $Notlocal_file = $file; } $Notlocal_ref_filename = $filename; # Prevent from recursing if there's an easy check... next if (&AlreadyChecked($Notlocal_file,$Notlocal_ref_filename)); &GetReferences($Notlocal_file, $Notlocal_ref_filename,$dir); } # foreach chdir($Old_Dir) if ($dir ne $Old_Dir); } #sub GetReferences sub GetLinks { # -- GetLinks($file) # Read the html file and extract all links # returns @Newlist local($filename) = @_; local(@Tags,$tag,%Newlist,@Newlist,%LocalAnchorsFound,%LocalAnchorsWanted); # Read the file into a big string and remove crud in between tags. 
#print "opening $filename\n"; #unless (open(HTML, $filename)) { # print "Could not open file $filename\n"; # return; #} #undef($/); # no input record separator #$_ = ; #$/ = "\n"; # input record separator is newline #close(HTML); #print "File: $filename\n"; ($fname = $filename) =~ s/$SiteRoot/\//; $status = http_request($MyHost,$MyPort,'GET',"$fname",$_); #print "status=$status http://$MyHost$fname\n"; if ($status < 0 || $status >= 300) { &PrintDot('-') if ($Dots); print " Could not retrieve $file: $status $FailStatusMsgs{$status}\n" # if (!$Silent) || ($Errors) ; return; } s/\n/ /g; # Remove html comments s///g; @Tags = split(/.*//; #print "tag: $_\n"; # ) if ($Link =~ m/^#/) { #print " -$filename$Link- wanted\n"; # Special case for Intermediair: do not generate error for "href=file.html#" # (empty name anchor) #print " -$Link- wanted\n"; if ($Intermediair) { next if ($Link eq "#"); } $LocalAnchorsWanted{"$filename$Link"} = 1; } # Link to another document? a href=file.html#anchor elsif ($Link =~ m/#/) { $Link =~ m/(.+)#(.+)/; #print "LINK: $Link $1 $file - equal?\n"; if ($1 eq $file) { # Current file after all $LocalAnchorsWanted{("$filename" . '#' . "$2")} = 1; } else { $Newlist{$Link} = 1; } } else { # Just a file ref $Newlist{$Link} = 1; } } if (m#name\s*=\s*["']?([^"'\s]*)["']?#i) { # -- a name=... 
$Link = $1; #print " name: $Link\n"; #print " -$filename$Link- found\n"; $LocalAnchorsFound{"$filename#$Link"} = 1; } } # frame src= elsif (/^frame\s+/i) { if (m#src\s*=\s*["']?([^"'\s]*)["']?#i) { $Link = $1; #print " frame: $Link\n"; $Newlist{$Link} = 1; } else { print " Frame parse error in ",&PrintFile($filename),":\n $_\n"; } } # area href= elsif (/^area\s+/i) { if (m#href\s*=\s*["']?([^"'\s]*)["']?#i) { $Link = $1; #print " area: $Link\n"; $Newlist{$Link} = 1; } else { print " Area parse error in ",&PrintFile($filename),":\n $_\n"; } } # link href= elsif (/^link\s+/i) { if (m#href\s*=\s*["']?([^"'\s]*)["']?#i) { $Link = $1; #print " link: $Link\n"; $Newlist{$Link} = 1; } } # = $TimeStamp); } } # CheckTimeStamp
# NOTE(review): the single line above is the tail of definitions that begin
# earlier in the file.  Its text is damaged in this copy of the script, so it
# is carried over byte-for-byte; restore it from a pristine source.


sub PrintTimeStamp {

    # -- PrintTimeStamp($mtime)
    # Formats an epoch time, in local time, as "yyyy/mm/dd at hh:mm:ss".
    # Every field is zero-padded.

    my ($mtime) = @_;
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime($mtime);

    # localtime() reports the month as 0..11 and the year as an offset
    # from 1900, so both are adjusted before printing.
    return sprintf("%04d/%02d/%02d at %02d:%02d:%02d",
                   $year + 1900, $mon + 1, $mday, $hour, $min, $sec);

}  # PrintTimeStamp


sub ConvertTimeStamp {

    # -- ConvertTimeStamp($Date,$Time)
    # Converts a date/time pair to an epoch time via Time::Local::timelocal.
    #   $Date: yymm or yymmdd  (day defaults to 1)
    #   $Time: hhmm or hhmmss  (seconds default to 0; a missing or
    #                           unparsable time means 00:00:00)
    # Dies when $Date holds no yymm digits.

    # Perl library module
    use Time::Local;

    my ($Date, $Time) = @_;

    unless (defined($Date) && $Date =~ m/(\d\d)(\d\d)(\d\d)?/) {
        $Date = '' unless defined($Date);
        die "ConvertTimeStamp: invalid date '$Date' (expected yymm or yymmdd)\n";
    }
    my ($year, $month, $day) = ($1, $2 - 1, defined($3) ? $3 : 1);

    # Only take the time fields from a successful match; after a failed
    # match $1/$2 would still hold the captures of the date match above.
    my ($hour, $min, $sec) = (0, 0, 0);
    if (defined($Time) && $Time =~ m/(\d\d)(\d\d)(\d\d)?/) {
        ($hour, $min) = ($1, $2);
        $sec = $3 if defined($3);
    }

    # From Perl library module
    return timelocal($sec, $min, $hour, $day, $month, $year);

}  # ConvertTimeStamp


#---------------------------------------------

sub Check_External_URLs {

    # -- Check_External_URLs(*list, $header)
    # Probes every URL that is a key of %list (passed as a typeglob) with
    # Check_URL, unless %HTTP_OK_List or %HTTP_Fail_List already has a
    # defined entry for it.  Results go to %HTTP_OK_List (value: the
    # referers recorded in %HTTPList) or %HTTP_Fail_List (value: the status
    # code).  Finally both result lists are printed.
    #
    # $rcode, @TheList and @SortedList are package globals here, exactly as
    # before, in case code outside this block reads them.

    local(*list, $header) = @_;
    local($URL);

    print "\n\n----------------\n$header\n" if !$Silent && !$HTMLReport;
    # NOTE(review): the HTML-report strings in this sub contain no markup
    # in this copy of the script; they were presumably HTML originally --
    # confirm against a pristine source.
    print "\n\n$header\n\n\n" if !$Silent && $HTMLReport;

    @TheList    = keys(%list);
    @SortedList = sort @TheList;

    foreach $URL (@SortedList) {

        print "$URL \n" if !$Silent;

        # Already checked on this one
        next if defined($HTTP_OK_List{$URL}) || defined($HTTP_Fail_List{$URL});

        sleep($HTTPDelay);
        $rcode = Check_URL($URL);

        if (defined($OkStatusMsgs{$rcode})) {
            # URL is ok, server responds and all.
            print " Ok\n" if !$Silent;
            PrintDot('+') if ($Dots);
            $HTTP_OK_List{$URL} = $HTTPList{$URL};   # The references
        }
        else {
            # Something is wrong.
            PrintDot('-') if ($Dots);
            print "\n" if ($Errors && $Dots);
            if (!$Silent || $Errors) {
                if (defined($FailStatusMsgs{$rcode})) {
                    print " Failed: $FailStatusMsgs{$rcode}\n";
                }
                else {
                    print " Failed with code $rcode\n";
                }
            }
            $HTTP_Fail_List{$URL} = $rcode;
        }
        print "\n\n" if (!$Silent && $HTMLReport);
    }

    $| = 0;   # autoflush off: let the report below be buffered

    # PrintList takes a typeglob; passing the hash itself would flatten it
    # into the argument list and nothing would be printed.
    PrintList(*HTTP_OK_List, "URLs checked ok:") if (!$Silent);
    Print_Failed_URL_List(*HTTP_Fail_List, "Failed URLs:");

}


sub Check_URL {

    # -- Check_URL($URL)
    # Splits an http://host[:port][/path] URL and issues a HEAD request
    # for it through http_request.  Returns the status from http_request,
    # or nothing (after printing a message) when the URL is not an http
    # URL or cannot be split.
    #
    # $host, $port, $path and $status are package globals here, exactly as
    # before.

    local($URL) = @_;

    if ($URL !~ m#^http://.*#i) {
        print "wrong format url '$URL'!\n";
        return;
    }

    # Get the host and port
    if ($URL =~ m#^http://([^:/]+):?(\d*)($|/(.*))#i) {
        $host = $1;
        $port = $2;
        $path = $3;
    }
    else {
        print "Unrecognized URL: $URL\n";
        return;
    }

    $path = '/' if $path eq "";
    $port = 80  if $port eq "";

    # Delete name anchor. (check if the anchor is present in the doc?)
    $path =~ s/#.*//;

    $status = http_request($host, $port, 'HEAD', $path);
    return $status;

}


#
# Make an HTTP request
#
# -- http_request($host,$port,$method,$path[,$reply])
#
# Sends "$method $path HTTP/1.0" to $host:$port and returns the status
# taken from the first line of the reply.  When that field is not purely
# numeric, the whole first line (with <...> sequences removed) is returned
# instead.  Failures are reported as negative codes:
#   -1 host lookup failed    -2 could not open socket
#   -3 could not bind        -4 could not connect
#   -5 timed out, or any other exception during the exchange
#
# If a 5th argument is given, we use it to store
# the HEAD or BODY of the HTTP reply: for HEAD requests it receives the
# status line and headers, for any other method the reply body.
#
sub http_request {

    my($host,$port,$method,$path) = @_;
    local($_);
    my($response,$wantresponse,$status);
    my($hostname,$name,$aliases,$protocol);

    if ($#_ > 3) {
        $wantresponse = 1;
        $_[4] = "";      # $_[4] aliases the caller's variable
    }

    # The following is largely taken from the Camel book, chapter 6

    $AF_INET = 2;
    $SOCK_STREAM = 1;

    $sockaddr = 'S n a4 x8';  # (Bless you)

    chop($hostname = `hostname`);

    ($name,$aliases,$proto) = getprotobyname('tcp');
    ($name,$aliases,$port) = getservbyname($port,'tcp') unless $port =~ /^\d+$/;
    ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
    if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))) {
        print "gethostbyname failed: $!\n\n";
        return -1;
    }

    $this = pack($sockaddr, $AF_INET, 0, $thisaddr);
    $that = pack($sockaddr, $AF_INET, $port, $thataddr);

    #
    # setup alarm so we can timeout if things take too long
    #
    $retval = eval {
        local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
        alarm $HTTPTimeout;

        # Make the socket filehandle.
        # ** Temporary fix, this is NOT The way to do it. 15-APR-96
        # But we'll still use it anyway, cannot rely on Perl to be
        # installed correctly everywhere.
        if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) {
            $SOCK_STREAM = 2;
            if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) {
                return -2;
            }
        }

        # Give the socket an address
        if (!(bind(S, $this))) {
            return -3;
        }

        if (!(connect(S, $that))) {
            return -4;
        }

        # Unbuffer the socket, then put back whatever handle the caller
        # had selected (not necessarily STDOUT).
        my $previous = select(S);
        $| = 1;
        select($previous);

        print S "$method $path HTTP/1.0\r\n";
        print S "Host: $host\r\n";
        print S "User-Agent: $WebxrefAgent\r\n";
        print S "\r\n";

        # First line of the reply: "<protocol> <status> ..."
        $response = <S>;
        ($protocol, $status) = split(/\s+/, $response);
        $_[4] .= $response if $method eq 'HEAD' && $wantresponse;
        if ($status !~ /^[0-9]+$/) {
            ($status = $response) =~ s/<.*?>//g;
            chomp $status;
        }

        while (<S>) {    # read rest of HTTP header
            last if /^\r?$/;
            $_[4] .= $_ if $method eq 'HEAD' && $wantresponse;
        }

        if ($method ne 'HEAD') {
            while (<S>) {    # read body of response
                $_[4] .= $_ if $wantresponse;
            }
        }
        alarm 0;
    };
    my $failure = $@;

    # The early "return -N" paths above leave the eval with the alarm
    # still armed and the local ALRM handler already removed; cancel it so
    # it cannot fire later in unrelated code.
    alarm 0;

    if ($failure) {         # timeout or something
        $status = -5;
    }
    elsif ($retval < 0) {   # connect failed or something
        $status = $retval;
    }
    close(S);

    return $status;
}

#---------------------------------------------

#
# Sort the list and remove doubles
#
# -- SortUnique(@items)
# Returns the distinct items of @items in default (string) sort order.
#
sub SortUnique {

    # A lexical hash: the callers alias a package hash named %list, which
    # a local(%list) here would temporarily blank out.
    my %seen;
    @seen{@_} = ();

    return (sort(keys(%seen)));

}  # SortUnique


#
# Returns filename without the root path
#
# -- PrintFile($file)
# A $file equal to $SiteRoot becomes '/'.  Unless $FullPath is set, a
# leading $SiteRoot is removed from $file.
#
sub PrintFile {
    my ($file) = @_;

    $file = '/' if $file eq $SiteRoot;

    # No /o modifier: the pattern must follow the current $SiteRoot rather
    # than whatever value it had on the first call.
    $file =~ s#^\Q$SiteRoot\E## if (!$FullPath);  # delete root from path

    return $file;
}

#
# Convert absolute
# filename into relative url
#
# -- PrintUrl($url)
# Passes $url through PrintFile, then prefixes '/' unless the result
# contains "://" or already starts with '/'.
#
sub PrintUrl {
    my ($url) = @_;

    $url = PrintFile($url);
    $url = '/' . $url unless $url =~ m#://# || $url =~ m#^/#;

    return $url;
}


sub Print_Failed_URL_List {

    # -- Print_Failed_URL_List(*list, $header)
    # Prints $header followed by every URL that is a key of %list (passed
    # as a typeglob) with its status code and the matching text from
    # %FailStatusMsgs.  URLs are ordered by status code, then by URL.
    # With $Xref set, the referers recorded in %HTTPList are listed too.
    #
    # $rcode is assigned as a package global, exactly as before.

    local(*list, $header) = @_;

    my (@SortedList, @SortedReferList);
    my ($URL, $lostURL);

    # Don't list empty lists
    return if (! %list);

    # NOTE(review): the HTML-report strings in this sub contain no markup
    # in this copy of the script, and some HTML/plain branches print the
    # same text; they were presumably HTML originally -- confirm against a
    # pristine source.
    print "\n\n" if ($HTMLReport);

    print "\n\n", '-' x length($header);
    print "\n$header\n";
    print '-' x length($header), "\n";

    print "\n" if ($HTMLReport);

    # Status codes come from the list that was passed in, not from a fixed
    # global; the URL is the tie-breaker so the order is reproducible.
    @SortedList = sort { ($list{$a} <=> $list{$b}) || ($a cmp $b) } (keys(%list));

    foreach $URL (@SortedList) {
        if ($HTMLReport) {
            print "$URL\n";
        }
        else {
            print "$URL \n";
        }
        $rcode = $list{$URL};
        print "  Status: $rcode ($FailStatusMsgs{$rcode})\n";

        if ($Xref) {
            @SortedReferList = split(/ /, $HTTPList{$URL});
            @SortedReferList = SortUnique(@SortedReferList);
            print "    Referenced by:\n";
            foreach $lostURL (@SortedReferList) {
                if ($HTMLReport) {
                    print "    ", PrintFile($lostURL), "\n";
                }
                else {
                    print "    ", PrintFile($lostURL), "\n";
                }
            }
        }  # $Xref
    }

}  # sub Print_Failed_URL_List




sub PrintList {

    # -- PrintList(*list, $header, $NoXref)
    # Prints $header (with the number of entries appended) followed by the
    # sorted keys of %list, which is passed as a typeglob.  When $Xref or
    # $OneXref is set, each key is followed by its referers, taken from
    # the space-separated hash value; with $OneXref only the first one.
    # A true $NoXref suppresses the referers for lists whose values are
    # not referer lists.  Nothing is printed for an empty list.

    local(*list, $header, $NoXref) = @_;

    my ($file, $i);
    my (@SortedFileList, @SortedReferList);

    # Don't list empty lists
    return if (! %list);

    @SortedFileList = sort(keys(%list));

    # Append number to header
    $header = "$header ".($#SortedFileList+1);

    # NOTE(review): the HTML-report strings in this sub contain no markup
    # in this copy of the script, and the HTML/plain branches print the
    # same text; they were presumably HTML originally -- confirm against a
    # pristine source.
    print "\n" if $HTMLReport;

    print "\n", '-' x length($header);
    print "\n$header\n";
    print '-' x length($header), "\n";

    print "\n" if $HTMLReport;

    foreach $file (@SortedFileList) {
        if ($HTMLReport) {
            print "", PrintFile($file), "\n";
        }
        else {
            print PrintFile($file), "\n";
        }

        # Callers pass $NoXref for lists that keep no referers.
        next if $NoXref;

        if ($Xref || $OneXref) {
            @SortedReferList = split(/ /, $list{$file});
            @SortedReferList = SortUnique(@SortedReferList);
            print "  Referenced by:\n";
            foreach $i (@SortedReferList) {
                if ($HTMLReport && $i ne $WebxrefReferer) {
                    print "    ", PrintFile($i), "\n";
                }
                else {
                    print "    ", PrintFile($i), "\n";
                }

                last if ($OneXref);
            }
        }  # $Xref
    }

    print "\n\n" if ($HTMLReport);

    # CAUTION! TRY TO RETRIEVE MEMORY 13.3.97
    # undef %list unless $interrupted;

}  # sub PrintList


sub PrintFluff {

    # -- PrintFluff(*list, $header)
    # Prints $header (with the number of entries appended) followed by the
    # elements of @list in their stored order.
    # The Fluff list is an array, not a hash

    local(*list, $header) = @_;

    # Don't list empty lists
    return if (! @list);

    # Append number to header
    $header = "$header ".($#list+1);

    # NOTE(review): the HTML-report strings here contain no markup in this
    # copy of the script -- confirm against a pristine source.
    print "\n" if ($HTMLReport);

    print "\n", '-' x length($header);
    print "\n$header\n";
    print '-' x length($header), "\n";

    print "\n" if ($HTMLReport);

    foreach my $entry (@list) {
        if ($HTMLReport) {
            print "", PrintFile($entry), "\n";
        }
        else {
            print PrintFile($entry), " \n";
        }
    }

    print "\n\n" if ($HTMLReport);

}  # PrintFluff


sub PrintLists {

    # -- PrintLists()
    # Prints the final report: the informational lists (only with
    # $LongReport), then the problem lists, then the list selected by
    # $Before / $After.

    $| = 0;   # autoflush off while the report is written

    if ($LongReport) {
        # List all files found
        PrintList(*FileList, "Web documents found:", 0);

        # List of directories referenced
        PrintList(*DirList, "Directories:", 0);

        # List of images referenced
        PrintList(*ImageFileList, "Images:", 0);

        # List of mailto's
        PrintList(*MailList, "Mailto:", 0);

        # List of ftp's
        PrintList(*FTPList, "ftp:", 0);

        # List of https's
        PrintList(*HTTPSList, "https:", 0);

        # List of telnets
        PrintList(*TelnetList, "telnet:", 0);

        # List of gophers
        PrintList(*GopherList, "gopher:", 0);

        # List of news
        PrintList(*NewsList, "News:", 0);

        # List of http's
        PrintList(*HTTPList, "External URLs:", 0);

        # List of files skipped for various reasons
        PrintList(*AvoidFileList, "Files Skipped:", 0);

        # List of file:'s
        # &PrintList(*ExtFileList,"External file:",0) if $LongReport;

        # List of cgi-bin scripts/forms
        PrintList(*CGIList, "cgi-bin scripts/forms:", 0);

        # List of name anchors
        PrintList(*AnchorList, "Name anchors found:", 0);
    }

    # ---- PROBLEM SECTION -----

    # List of files that can't be found
    PrintList(*LostFileList, "Files not found:", 0);

    # List of cgi-bin scripts/forms not found
    PrintList(*LostCGIList, "cgi-bin scripts/forms not found:", 0);

    # List of files that are not world readable
    # Override Xref, as reference list for non-world-readable files is not kept
    PrintList(*UnreadableList, "Files not world readable:", 1);

    # List of name anchors not found
    PrintList(*LostAnchorList, "Name anchors not found:", 0);

    # List of files found in the directories but not referenced
    PrintFluff(*FluffFiles, "Files not referenced:");

    # List of directories found in the directories but not referenced
    PrintFluff(*FluffDirectories, "Directories not referenced:");

    # List of files matching the date/time criterium
    if ($Before) {
        PrintList(*TimeList,
                  "Files last modified before: ". PrintTimeStamp($TimeStamp).":", 1);
    }
    elsif ($After) {
        PrintList(*TimeList,
                  "Files last modified after: ". PrintTimeStamp($TimeStamp).":", 1);
    }

    if ($HTMLReport) {
        # NOTE(review): no markup in this string in this copy of the
        # script -- confirm against a pristine source.
        print "\n\nFinished checking links to local documents.\n\n\n"
    }
    else {
        print "\nFinished checking links to local documents.\n";
    }

    $| = 1;   # autoflush back on (unbuffered output)

}  #sub PrintLists


sub InitStatusMessages {

    # -- InitStatusMessages()
    # Fills the package hashes %OkStatusMsgs and %FailStatusMsgs, keyed by
    # status code.  The negative keys are the failure codes returned by
    # http_request; the rest are HTTP status codes.

    # HTTP status codes and messages

    %OkStatusMsgs = (
        200 => "OK 200",
        201 => "Created 201",
        202 => "Accepted 202",
        203 => "Non-Authoritative 203",
        204 => "No Content 204",
    );

    %FailStatusMsgs = (
        -1  => "Could not lookup server",
        -2  => "Could not open socket",
        -3  => "Could not bind socket",
        -4  => "Could not connect",
        -5  => "Timed out",
        300 => "Multiple Choices",
        301 => "Moved Permanently",
        302 => "Moved Temporarily (may need to add a /)",
        303 => "See Other",
        304 => "Not Modified",
        305 => "Use Proxy",
        307 => "Temporary Redirect",
        400 => "Bad request",
        401 => "Unauthorized",
        402 => "Payment Required",
        403 => "Forbidden",
        404 => "Not found",
        405 => "Method Not Allowed",
        406 => "Not Acceptable",
        407 => "Proxy Authentication Required",
        408 => "Request Timeout",
        409 => "Conflict",
        410 => "Gone",
        500 => "Internal Error",
        501 => "Not implemented",
        502 => "Bad Gateway",
        503 => "Service Unavailable ",
        504 => "Gateway Timeout ",
    );

}  # sub InitStatusMessages

#
# convert local http://... references to local file/directory references
#
# -- MakeLocal($url)
# When $MyHost is set and $url is an http URL for exactly that host on
# port $MyPort (a URL without a port counts as port 80), returns the part
# after host[:port].  Any other $url is returned as is.  While $MyHost is
# set, an empty (false) result becomes '/'.
#
sub MakeLocal {
    my ($url) = @_;

    if ($MyHost) {
        # \Q..\E: the dots of the host name must match literally.  The
        # lookahead requires the host[:port] part to end right there, so a
        # longer host name that merely starts with $MyHost is not local.
        if ($url =~ m{^http://\Q$MyHost\E(?::(\d+))?(?=[/?#]|\z)(.*)}i) {
            my ($port, $rest) = ($1, $2);
            $port = 80 unless defined($port);
            $url = $rest if $port == $MyPort;
        }
        $url = '/' unless $url;
    }

    return($url);
}

# This is the last line of the webxref script really.
# If this line is missi