ad_replicate_web_robots_db

one of the documented procedures in this installation of the ACS
Usage:
ad_replicate_web_robots_db   db
What it does:
Replicates data from the Web Robots Database (http://info.webcrawler.com/mak/projects/robots/active.html) into a table in the ACS database. The data is published on the Web as a flat file, whose format is specified in http://info.webcrawler.com/mak/projects/robots/active/schema.txt. Basically, each non-blank line of the database corresponds to one field (name-value pair) of a record that defines the characteristics of a registered robot. Each record has a "robot-id" field as a unique identifier. (There are many fields in the schema, but, for now, the only ones we care about are: robot-id, robot-name, robot-details-url, and robot-useragent.)

Returns the number of rows replicated. May raise a Tcl error that should be caught by the caller.

Defined in: /web/philip/tcl/ad-robot-defs.tcl

Source code:


    # Fetches the Web Robots Database flat file from the URL configured
    # as parameter "WebRobotsDB" (section "robot-detection") and loads
    # each usable record into the "robots" table through the database
    # handle $db.
    #
    # Returns the number of rows inserted or updated. Errors raised by
    # ns_geturl or ns_db are not caught here; the caller should catch
    # them.
    #
    set web_robots_db_url [ad_parameter WebRobotsDB robot-detection]

    set result [ns_geturl $web_robots_db_url headers]
    set page [split $result "\n"]

    # ---- Phase 1: parse the flat file into a list of records. ----
    #
    # Each element of "records" is a four-element list:
    #   robot_id robot_name robot_details_url robot_useragent
    #
    # "current" holds the fields of the record being read.
    #
    set records [list]
    set field_keys [list robot_id robot_name robot_details_url robot_useragent]
    foreach key $field_keys {
        set current($key) ""
    }

    foreach line $page {
        # A "robot-id" line delimits a new record, so each time we
        # encounter one we must first save the prior record (if there
        # is one). There is only one case in which there will *not*
        # be a prior record, i.e., for the very first record.
        #
        if {[regexp "robot-id: *(.+)" $line match new_robot_id]} {
            # Only keep records that have an actual value for
            # robot_useragent. (There's no point in keeping info
            # about robots that we can't identify.)
            #
            if {![empty_string_p $current(robot_id)] && ![empty_string_p $current(robot_useragent)]} {
                lappend records [list $current(robot_id) $current(robot_name) $current(robot_details_url) $current(robot_useragent)]
            }

            # Clear out the record so we can start anew. Fields seen
            # before the very first robot-id line are kept, so that
            # they still attach to the first record.
            #
            if {![empty_string_p $current(robot_id)]} {
                foreach key $field_keys {
                    set current($key) ""
                }
            }
            set current(robot_id) [string trim $new_robot_id]
        }

        # For the remaining fields the first non-empty value seen
        # within a record wins.
        #
        if {[regexp "robot-name: *(.+)" $line match value]} {
            if {[empty_string_p $current(robot_name)]} {
                set current(robot_name) [string trim $value]
            }
        }

        if {[regexp "robot-details-url: *(.+)" $line match value]} {
            if {[empty_string_p $current(robot_details_url)]} {
                set current(robot_details_url) [string trim $value]
            }
        }

        if {[regexp "robot-useragent: *(.+)" $line match value]} {
            if {[empty_string_p $current(robot_useragent)]} {
                set current(robot_useragent) [string trim $value]
            }
        }
    }

    # Don't forget the last record. It is identified by its *own*
    # robot_id (the one still held in "current"), not by the id of the
    # record that preceded it.
    #
    if {![empty_string_p $current(robot_id)] && ![empty_string_p $current(robot_useragent)]} {
        lappend records [list $current(robot_id) $current(robot_name) $current(robot_details_url) $current(robot_useragent)]
    }

    # ---- Phase 2: write every record into the database. ----
    #
    # Update the row if one with the same robot_id already exists,
    # otherwise insert a new row.
    #
    set robot_count 0
    foreach record $records {
        set this_robot_id [lindex $record 0]

        # Escape single quotes once, for embedding in SQL literals.
        set sql_id [DoubleApos $this_robot_id]
        set sql_name [DoubleApos [lindex $record 1]]
        set sql_details_url [DoubleApos [lindex $record 2]]
        set sql_useragent [DoubleApos [lindex $record 3]]

        if {[robot_exists_p $db $this_robot_id]} {
            ns_log Notice "Updating existing robot: $this_robot_id"
            # NOTE(review): previously only the update of the final
            # record set insertion_date; it is now applied to every
            # update so all refreshed rows are treated alike. Confirm
            # that this is the intended meaning of insertion_date.
            ns_db dml $db "update robots set robot_name = '$sql_name', robot_details_url = '$sql_details_url', robot_useragent = '$sql_useragent', insertion_date = sysdate where robot_id = '$sql_id'"
        } else {
            ns_log Notice "Inserting new robot: $this_robot_id"
            ns_db dml $db "insert into robots(robot_id, robot_name, robot_details_url, robot_useragent) values('$sql_id', '$sql_name', '$sql_details_url', '$sql_useragent')"
        }

        incr robot_count
    }

    return $robot_count


philg@mit.edu