# search engines are too stupid to follow URLs like
# foobar.tcl?msg_id=34
# and Web standards people are too stupid to extend the
# robots.txt file format so that you can tell search engines
# to follow such links
# so here is an alternative that will allow the contents of my bboards
# to be indexed. Someone somewhere has to link to the top level page:
# /bboard/search-engine-bait.html
# GET requests under /bboard/seb (e.g. /bboard/seb/<msg_id>.html) are
# handled by the proc that serves a single posting.
ns_register_proc GET /bboard/seb bboard_search_engine_bait_fetch_one
# GET requests for the top-level bait page are handled by the proc that
# lists every message in the bboard table.
ns_register_proc GET /bboard/search-engine-bait.html bboard_search_engine_bait_top
# Scooter seems to be unhappy about following links from a page for which
# it doesn't get a byte count, so we have to build the whole page as one
# big string and send it with a single ns_return
proc bboard_search_engine_bait_top {conn ignore} {
    # Serve the top-level "search engine bait" page.
    #
    # The page is a fixed introduction, followed by one line per row of the
    # bboard table (ordered by sort_key), followed by a fixed signature.
    # Everything is accumulated into one string and sent with a single
    # ns_return.
    #
    # conn   - connection the response is written to
    # ignore - second argument supplied by ns_register_proc; not used here

    # Fixed introduction.
    set page "
the robots.txt format is idiotic
the robots.txt format is idiotic
so I had to create this program
by Philip Greenspun
There is a nice little bboard system on this
server. Because it was not programmed by a loser, the postings
are stored in a relational database. Consequently the only way for
the data to get out to the Web is via a CGI or API script. Web
indexing robots try not to invoke scripts so they turn back when they
see a URL like /bboard/fetch-msg.tcl?msg_id=000001
.
There is
a standard (the /robots.txt file) by which Web publishers are
supposed to be able to communicate with robots. However, the people
who designed this standard are still living in the 1960s where all
information is stored in the Unix file system. So they can't
understand the need for publishers to be able to use the robots.txt
file to tell their robots to look at links containing \"?\" and other
CGI-like syntax.
I could have waited for these guys to read Web Tools Review and move
into the 1970s. But it turned out to be easier to write this program
and link to it as bait for the search engines. It grabs the data out
in what look like static HTML files.
For more on how lame the Web standards are, read
We Have Chosen Shame and Will Get War.
"

    # First pass: gather every message id in sort_key order.
    # NOTE(review): set_variables_after_query is defined elsewhere; it
    # presumably reads the caller's `selection` to set msg_id, so the
    # names `db` and `selection` are kept as-is -- confirm.
    set sql "select msg_id, sort_key from bboard order by sort_key"
    set db [ns_db gethandle]
    set selection [ns_db select $db $sql]
    set all_msg_ids [list]
    while {[ns_db getrow $db $selection]} {
        set_variables_after_query
        lappend all_msg_ids $msg_id
    }

    # Second pass: one line per message.
    foreach one_msg_id $all_msg_ids {
        append page "- message " $one_msg_id "\n"
    }

    # Fixed signature.
    append page "
philg@mit.edu
"

    ns_return $conn 200 text/html $page
}
proc bboard_search_engine_bait_fetch_one {conn ignore} {
    # Serve one bboard posting as what looks like a static HTML page.
    #
    # The message id is taken from the request URL, which is expected to
    # look like /bboard/seb/<msg_id>.html.  Possible responses:
    #   - an "Error Fetching Message" page when the URL does not have that
    #     shape or the row cannot be fetched from bboard;
    #   - whatever bboard_return_cannot_find_topic_page sends when the
    #     topic row cannot be fetched from bboard_topics;
    #   - a "Password Protected" page when bboard_topic_user_password_p
    #     reports true for the topic;
    #   - otherwise the posting itself.
    #
    # conn   - connection the response is written to
    # ignore - second argument supplied by ns_register_proc; not used here

    # regexp leaves this_msg_id untouched when the URL does not match, so
    # give it a value up front; otherwise a URL such as /bboard/seb/foo
    # would blow up with "can't read this_msg_id" instead of producing
    # the error page.
    set this_msg_id ""
    set row_found_p 0
    if {[regexp {/bboard/seb/(.*)\.html$} [ns_conn url $conn] match this_msg_id]} {
        set db [ns_db gethandle]
        # The id comes straight from the URL: double any apostrophes before
        # putting it into SQL, exactly as is done for $topic below.
        if {![catch {set selection [ns_db 1row $db "select posting_time::date as posting_date,* from bboard where msg_id = '[DoubleApos $this_msg_id]'"]} errmsg]} {
            set row_found_p 1
        }
    }

    if {!$row_found_p} {
        # couldn't find the message in the database
        # The id is user-supplied, so HTML-quote it before echoing it back.
        set quoted_msg_id [ns_quotehtml $this_msg_id]
        ns_return $conn 200 text/html "
Error Fetching Message $quoted_msg_id
Error Fetching Message $quoted_msg_id
We could not find the message you asked for in the database. You
almost surely got here from a search engine like AltaVista. Such
engines can have data that was out of date months ago. Probably the
message suggested by the search engine was deleted because it was
deemed uninteresting by the bboard maintainer.
You might want to look at the current discussions
on this server. You will probably find some of the other postings
relevant.
[bboard_system_owner]
"
        return
    }

    # we found the row in the database
    set_variables_after_query
    # now variables like $message and $topic are defined

    if {[catch {set selection [ns_db 1row $db "select unique * from bboard_topics where topic='[DoubleApos $topic]'"]} errmsg]} {
        bboard_return_cannot_find_topic_page $conn
        return
    }
    # NOTE(review): $maintainer_email used below is presumably a
    # bboard_topics column set by this call -- confirm against the schema.
    set_variables_after_query

    if { [bboard_topic_user_password_p $db $topic] } {
        ns_return $conn 200 text/html "
Password Protected
Password Protected
The message requested was from a password-protected discussion in [bboard_system_name]. It is not available to
search engines.
[bboard_system_owner]
"
        return
    }

    # Only build the attribution line once we know the posting will be shown.
    set contributed_by "Contributed by $name ($email) on [util_IllustraDatetoPrettyDate $posting_date]."

    ns_return $conn 200 text/html "
$one_line
$one_line
$message
$contributed_by
This is a posting in the
$topic Q&A Forum.
This page was generated specifically
so that search engines would index the content of the forum. You will
find a better user interface by entering through
the standard user pages.
$maintainer_email
"
}