# **** note that this file should be placed in your AOLserver's private # Tcl directory if you want to call parse_all from a regular .tcl page # (Alternatively, you can cut and paste this entire file into the top of # the .tcl page that needs to use it.) # This defines an XML parser for a particular DTD, that of quotations. # An effort is made to keep this as general as possible for later extension # to a generalized XML parser. # # Ben Adida (ben@mit.edu), 12/29/1998 # # First we define a data abstraction of a parsed object # so that we can easily work with the parsing of a tag, one by one # This data abstraction packages up a tag, the tag's content, and the rest of # string to parse proc make_parsed_object {tag content rest} { return [list $tag $content $rest] } proc parsed_object_tag {obj} { return [lindex $obj 0] } proc parsed_object_content {obj} { return [lindex $obj 1] } proc parsed_object_rest {obj} { return [lindex $obj 2] } # We then define a procedure that performs the simple parsing of the next tag in # the XML string. This procedure will effectively look for the first "open" tag, # find the corresponding "close" tag, ignore what came before the open tag, parse out # what is between the open and close tag, and package the tag, tag content, and rest into # a parsed object. # # For efficiency and greediness reasons, we don't use regexps. proc simple_parse {str} { set first_open_bracket [string first "<" $str] set first_close_bracket [string first ">" $str] # If we have malformed XML, we return 0 if {($first_close_bracket < $first_open_bracket) || ($first_close_bracket == -1) || ($first_open_bracket == -1)} { # ns_log Notice "simple parse malformed XML 1, $first_open_bracket, $first_close_bracket, " return 0 } # Get the tag set tag [string range $str [expr $first_open_bracket + 1] [expr $first_close_bracket - 1]] # Find the first closing of that tag # FIX NEEDED in case the close of the tag is not same case as start tag. (ben@mit.edu) set close_tag_pos [string first $str] set tag [string tolower $tag] # There's a weird case to take care of here if the close tag appears first, before # any open tag.... (FIX NEEDED, ben@mit.edu) set content [string range $str [expr $first_close_bracket + 1] [expr $close_tag_pos - 1]] # The end of the close tag is (n+3) characters after the start, where # n is the length of the tag name, and 4 is justified by the "" characters # and one more for starting after the close character set end_close_tag_pos [expr $close_tag_pos + 4 + [string length $tag]] set rest [string range $str $end_close_tag_pos end] return [make_parsed_object $tag $content $rest] } # Now we write the DTD-specific parsing # This should be generalized by having a DTD parser that automatically creates these # parse_ procs (oh so very much like Scheme lambda expression, it makes you want to scream) # and then dispatches on each tag. But that's for another day # This procedure looks for a quotations XML object # to parse. THIS IS THE MAIN PROCEDURE TO CALL! proc parse_all {str} { set one_parse [simple_parse $str] # ns_log Notice "first parse" # If we don't have the right tag here, abort. if {[parsed_object_tag $one_parse] != "quotations"} { # ns_log Notice "not a quotations! [parsed_object_tag $one_parse]" return 0 } # Note that we don't care about the "rest" here return [parse_quotations [parsed_object_content $one_parse]] } # A quotations is a bunch of onequotes inside an open quotation / close quotation set of # tags. We will keep using the parsed_object abstraction, varying the data type of # the content position.... (we're almost in Scheme first-class citizen heaven) proc parse_quotations {str} { set list_of_onequotes [list] # Loop until we break while 1 { set one_parse [simple_parse $str] # ns_log Notice "parsing one onequotes" # If nothing comes out, we're done if {$one_parse == 0} { return [list quotations $list_of_onequotes] } # If something other than a onequote comes out, we have an error if {[parsed_object_tag $one_parse] != "onequote"} { return 0 } # Otherwise, parse the onequote! # We might want to check for malformed elements... depends how harsh # we want to be if one of the onequotes is broken! lappend list_of_onequotes [parse_onequote [parsed_object_content $one_parse]] set str [parsed_object_rest $one_parse] } } # This procedure parses a onequote. We could here have a parse_PCDATA # procedure that is called each time, but that is useless here, and # we want to make sure that we have a way of distinguishing between errors # and the text 0... we yearn for the NULL. proc parse_onequote {str} { ### QUOTATION_ID ### set one_parse [simple_parse $str] # Check that we have a quotation_id if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "quotation_id")} { # ns_log Notice "no quotation_id" return 0 } set quotation_id [parsed_object_content $one_parse] set str [parsed_object_rest $one_parse] ### INSERTION_DATE ### set one_parse [simple_parse $str] # Check that we have an insertion_date if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "insertion_date")} { # ns_log Notice "no insertion_date" return 0 } set insertion_date [parsed_object_content $one_parse] set str [parsed_object_rest $one_parse] ### AUTHOR_NAME ### set one_parse [simple_parse $str] # Check that we have an author name if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "author_name")} { # ns_log Notice "no author_name!" return 0 } set author_name [parsed_object_content $one_parse] set str [parsed_object_rest $one_parse] ### CATEGORY ### set one_parse [simple_parse $str] # Check that we have a category if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "category")} { # ns_log Notice "no category!" return 0 } set category [parsed_object_content $one_parse] set str [parsed_object_rest $one_parse] ### QUOTE ### set one_parse [simple_parse $str] # Check that we have a quote if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "quote")} { return 0 } set quote [parsed_object_content $one_parse] set str [parsed_object_rest $one_parse] # Now put this all together in an NS_SET # set return_set [ns_set create] # ns_set put $return_set quotation_id $quotation_id # ns_set put $return_set insertion_date $insertion_date # ns_set put $return_set author_name $author_name # ns_set put $return_set category $category # ns_set put $return_set quote $quote return [list onequote \ [list quotation_id $quotation_id] \ [list insertion_date $insertion_date] \ [list author_name $author_name] \ [list category $category] \ [list quote $quote]] }