# link-check.tcl
#
# AOLserver link verifier: crawls all of your Web content under the
# server's page root and finds the dead links.  Install this file as
# "link-check.tcl" somewhere accessible through an AOLserver 2.2
# (2.1 might work also but no guarantees), then request the URL and it
# will grind through the Web content.
#
# Copyright Jin Choi (jsc@arsdigita.com) and Philip Greenspun (philg@mit.edu)
# distributed under the GNU Public License

global webroot
global httproot
global debug_link_checker
global running_on_wimpy_machine

# set to 1 for verbose per-directory progress messages during the crawl
set debug_link_checker 0

# if you set this to 1 then the checker will sleep for 1 second periodically
# thus giving Web service a chance
set running_on_wimpy_machine 0

# webroot is the Unix fully qualified path of the server's page root;
# httproot is the externally visible URL prefix for this server
set webroot [ns_info pageroot]
set httproot [ns_conn location $conn]

# Write a minimal HTTP/1.0 success header directly to the connection so
# that results can then be streamed incrementally with ns_write.
proc ReturnHeaders {conn {content_type text/html}} {
    ns_write $conn "HTTP/1.0 200 OK
MIME-Version: 1.0
Content-Type: $content_type

"
}

# Emit one HTML list item naming file $f.  walk_tree calls this for
# every matching file found during the crawl.
# NOTE(review): the literal "<li>" tag below is reconstructed -- the
# copy this revision was produced from had the markup stripped out of
# the string literal (it survived only as a rendered bullet).
proc check_file {f conn} {
    ns_write $conn "<li>$f\n\n"
}

# Recursively walk the directory tree rooted at $dir, invoking
# $procedure (a proc taking args {file conn}) on every readable plain
# file whose name matches regexp $pattern (matched case-insensitively
# on Windows).  seen_already_cache names an array variable in the
# caller's scope that records canonical directory names already
# visited, so symlink cycles cannot cause infinite recursion.
proc walk_tree {dir procedure conn seen_already_cache {pattern {.*}}} {
    upvar $seen_already_cache seen
    global debug_link_checker
    global running_on_wimpy_machine
    # cd so that [pwd] yields the canonical directory name (so that we
    # can avoid infinite loops).  NOTE(review): as in the original,
    # this changes the working directory of the whole server process.
    cd $dir
    set canonical_dirname [pwd]
    if { [info exists seen($canonical_dirname)] } {
        if { $debug_link_checker == 1 } {
            ns_write $conn "walk_tree: skipping directory $canonical_dirname (already seen)\n"
        }
        return
    }
    set seen($canonical_dirname) 1
    if { $debug_link_checker == 1 } {
        ns_write $conn "walk_tree: checking out directory $dir\n"
    }
    foreach f [glob -nocomplain $dir/*] {
        if { ![file readable $f] } {
            continue
        }
        if { [file isdirectory $f] } {
            if { $running_on_wimpy_machine == 1 } {
                # we sleep for one second in order to not trash Web service
                ns_sleep 1
            }
            walk_tree $f $procedure $conn seen $pattern
        } elseif { [file isfile $f] } {
            # Windows filesystems are case-insensitive, so match that way
            if { [ns_info winnt] } {
                set match [regexp -nocase $pattern $f]
            } else {
                set match [regexp $pattern $f]
            }
            if { $match } {
                $procedure $f $conn
            }
        }
    }
}

## Assumes url is a URL valid for use with ns_httpopen.
# Returns the HTTP status code (e.g. "200", "404") for $url.  Uses a
# HEAD request by default; pass use_get_p = 1 for servers that
# mishandle HEAD.  Raises an error if the connection cannot be opened
# within $timeout seconds.
proc get_http_status {url {use_get_p 0} {timeout 30}} {
    if { $use_get_p } {
        set http [ns_httpopen GET $url "" $timeout]
    } else {
        set http [ns_httpopen HEAD $url "" $timeout]
    }
    # philg changed these to close BOTH rfd and wfd
    set rfd [lindex $http 0]
    set wfd [lindex $http 1]
    close $rfd
    close $wfd
    set headers [lindex $http 2]
    # the ns_set's name is the status line, e.g. "HTTP/1.0 200 OK";
    # word 1 of that line is the numeric status code
    set response [ns_set name $headers]
    set status [lindex $response 1]
    ns_set free $headers
    return $status
}

# Resolve the HREF string $reference_inside_href -- found inside an
# <a href="..."> in the HTML file $base_file -- to an absolute URL and
# return its HTTP status (or "error: ..." if the fetch fails).
# NOTE(review): the body of this proc was destroyed in the copy this
# revision was produced from; the implementation below is a
# reconstruction and should be compared against an intact original.
proc check_link {base_file reference_inside_href {use_get_p 0}} {
    # base_file is the full file system path where the HTML was found;
    # reference_inside_href is the string that was inside the <a href="...">
    global httproot
    global webroot
    if { [regexp -nocase {^https?://} $reference_inside_href] } {
        # already a fully qualified URL; test it as-is
        set url $reference_inside_href
    } elseif { [regexp {^/} $reference_inside_href] } {
        # absolute path on this server
        set url "$httproot$reference_inside_href"
    } else {
        # relative reference: resolve against the directory holding
        # $base_file, translated from filesystem space to URL space
        set dir_part [file dirname $base_file]
        regsub "^$webroot" $dir_part "" url_dir
        set url "$httproot$url_dir/$reference_inside_href"
    }
    if { [catch { set status [get_http_status $url $use_get_p] } errmsg] } {
        return "error: $errmsg"
    }
    return $status
}

# Main program: stream back an HTML page listing every HTML file under
# the page root as the tree is walked.
# NOTE(review): the markup below is reconstructed from de-tagged
# fragments of the original ("Testing Links", "at $httproot",
# "All HTML files:", "Jin S. Choi"); confirm against an intact copy.
ReturnHeaders $conn
ns_write $conn "<html>
<head>
<title>Testing Links at $httproot</title>
</head>
<body bgcolor=#ffffff>
<h2>Testing Links</h2>

at $httproot

<hr>

All HTML files:

<ul>
"

# NOTE(review): restricting the crawl to .html/.htm files is inferred
# from the page heading above -- verify against an intact original.
walk_tree $webroot check_file $conn walk_tree_seen_cache {\.html?$}

ns_write $conn "</ul>
<hr>
<address><a href=\"mailto:jsc@arsdigita.com\">Jin S. Choi</a></address>
</body>
</html>
"