parse-all.tcl

# **** note that this file should be placed in your AOLserver's private
# Tcl directory if you want to call parse_all from a regular .tcl page
# (Alternatively, you can cut and paste this entire file into the top of
#  the .tcl page that needs to use it.)

# This defines an XML parser for a particular DTD, that of quotations.
# An effort is made to keep this as general as possible for later extension
# to a generalized XML parser.
#
# Ben Adida (ben@mit.edu), 12/29/1998 
#

# First we define a data abstraction of a parsed object
# so that we can easily work with the parsing of a tag, one by one
# This data abstraction packages up a tag, the tag's content, and the rest of
# string to parse

proc make_parsed_object {tag content rest} {
    return [list $tag $content $rest]
}

proc parsed_object_tag {obj} {
    return [lindex $obj 0]
}

proc parsed_object_content {obj} {
    return [lindex $obj 1]
}

proc parsed_object_rest {obj} {
    return [lindex $obj 2]
}

# We then define a procedure that performs the simple parsing of the next tag in
# the XML string. This procedure will effectively look for the first "open" tag,
# find the corresponding "close" tag, ignore what came before the open tag, parse out 
# what is between the open and close tag, and package the tag, tag content, and rest into
# a parsed object.
#
# For efficiency and greediness reasons, we don't use regexps.

proc simple_parse {str} {
    set first_open_bracket [string first "<" $str]
    set first_close_bracket [string first ">" $str]

    # If we have malformed XML, we return 0
    if {($first_close_bracket < $first_open_bracket) || ($first_close_bracket == -1) || ($first_open_bracket == -1)} {
        # ns_log Notice "simple parse malformed XML 1, $first_open_bracket, $first_close_bracket, "
        return 0
    }

    # Get the tag
    set tag [string range $str [expr $first_open_bracket + 1] [expr $first_close_bracket - 1]]

    # Find the first closing of that tag
    # FIX NEEDED in case the close of the tag is not same case as start tag. (ben@mit.edu)
    set close_tag_pos [string first  $str]
    set tag [string tolower $tag]

    # There's a weird case to take care of here if the close tag appears first, before
    # any open tag.... (FIX NEEDED, ben@mit.edu)

    set content [string range $str [expr $first_close_bracket + 1] [expr $close_tag_pos - 1]]

    # The end of the close tag is (n+3) characters after the start, where
    # n is the length of the tag name, and 4 is justified by the "" characters
    # and one more for starting after the close character
    set end_close_tag_pos [expr $close_tag_pos + 4 + [string length $tag]]
    
    set rest [string range $str $end_close_tag_pos end]

    return [make_parsed_object $tag $content $rest]
}

# Now we write the DTD-specific parsing
# This should be generalized by having a DTD parser that automatically creates these
# parse_ procs (oh so very much like Scheme lambda expression, it makes you want to scream)
# and then dispatches on each tag. But that's for another day

# This procedure looks for a quotations XML object 
# to parse. THIS IS THE MAIN PROCEDURE TO CALL!
proc parse_all {str} {
    
    set one_parse [simple_parse $str]
    # ns_log Notice "first parse"

    # If we don't have the right tag here, abort.
    if {[parsed_object_tag $one_parse] != "quotations"} {
        # ns_log Notice "not a quotations! [parsed_object_tag $one_parse]"
        return 0
    }

    # Note that we don't care about the "rest" here

    return [parse_quotations [parsed_object_content $one_parse]]
}

# A quotations is a bunch of onequotes inside an open quotation / close quotation set of
# tags. We will keep using the parsed_object abstraction, varying the data type of
# the content position.... (we're almost in Scheme first-class citizen heaven)
proc parse_quotations {str} {
    
    set list_of_onequotes [list]

    # Loop until we break
    while 1 {
        set one_parse [simple_parse $str]
        # ns_log Notice "parsing one onequotes"

        # If nothing comes out, we're done
        if {$one_parse == 0} {
            return [list quotations $list_of_onequotes]
        }

        # If something other than a onequote comes out, we have an error
        if {[parsed_object_tag $one_parse] != "onequote"} {
            return 0
        }

        # Otherwise, parse the onequote!
        # We might want to check for malformed elements... depends how harsh
        # we want to be if one of the onequotes is broken!
        lappend list_of_onequotes [parse_onequote [parsed_object_content $one_parse]]
        set str [parsed_object_rest $one_parse]
    }
}

# This procedure parses a onequote. We could here have a parse_PCDATA
# procedure that is called each time, but that is useless here, and
# we want to make sure that we have a way of distinguishing between errors
# and the text 0... we yearn for the NULL.
proc parse_onequote {str} {
    
    ### QUOTATION_ID
    ###
    set one_parse [simple_parse $str]
    
    # Check that we have a quotation_id
    if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "quotation_id")} {
        # ns_log Notice "no quotation_id"
        return 0
    }

    set quotation_id [parsed_object_content $one_parse]
    set str [parsed_object_rest $one_parse]

    ### INSERTION_DATE
    ###
    set one_parse [simple_parse $str]
    
    # Check that we have an insertion_date
    if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "insertion_date")} {
        # ns_log Notice "no insertion_date"
        return 0
    }

    set insertion_date [parsed_object_content $one_parse]
    set str [parsed_object_rest $one_parse]

    ### AUTHOR_NAME
    ### 
    set one_parse [simple_parse $str]
    
    # Check that we have an author name
    if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "author_name")} {
        # ns_log Notice "no author_name!"
        return 0
    }

    set author_name [parsed_object_content $one_parse]
    set str [parsed_object_rest $one_parse]

    ### CATEGORY
    ###
    set one_parse [simple_parse $str]
    
    # Check that we have a category
    if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "category")} {
        # ns_log Notice "no category!"
        return 0
    }

    set category [parsed_object_content $one_parse]
    set str [parsed_object_rest $one_parse]

    ### QUOTE
    ###
    set one_parse [simple_parse $str]
    
    # Check that we have a quote
    if {($one_parse == 0) || ([parsed_object_tag $one_parse] != "quote")} {
        return 0
    }

    set quote [parsed_object_content $one_parse]
    set str [parsed_object_rest $one_parse]

    # Now put this all together in an NS_SET
#      set return_set [ns_set create]
#      ns_set put $return_set quotation_id $quotation_id
#      ns_set put $return_set insertion_date $insertion_date
#      ns_set put $return_set author_name $author_name
#      ns_set put $return_set category $category
#      ns_set put $return_set quote $quote

    return [list onequote \
            [list quotation_id $quotation_id] \
            [list insertion_date $insertion_date] \
            [list author_name $author_name] \
            [list category $category] \
            [list quote $quote]]

}