ad_html_to

ad_html_to_text (public)

 ad_html_to_text [ -maxlen maxlen ] [ -showtags ] [ -no_format ] html

Defined in packages/acs-tcl/tcl/text-html-procs.tcl

Returns a best-guess plain text version of an HTML fragment. Parses the HTML and does some simple formatting. The parser and formatting is pretty stupid, but it's better than nothing.

Switches:

-maxlen (optional, defaults to "70"): the line length you want your output wrapped to.
-showtags (optional, boolean): causes any unknown (and uninterpreted) tags to get shown in the output.
-no_format (optional, boolean): causes hyperlink tags not to get listed at the end of the output.

Parameters:

html (required)

Authors:

Lars Pind <lars@pinds.com>

Aaron Swartz <aaron@swartzfam.com>

Created:

19 July 2000

Partial Call Graph (max 5 caller/called nodes):

Testcases:

html_to_text, ad_html_to_text_bold, ad_html_to_text_anchor, ad_html_to_text_image, ad_html_to_text_clipped_link, text_to_html

Source code:

    set output(text) {}
    set output(linelen) 0
    set output(maxlen) $maxlen
    set output(pre) 0
    set output(p) 0
    set output(br) 0
    set output(space) 0
    set output(blockquote) 0

    set length [string length $html]
    set last_tag_end 0

    # For showing the URL of links.
    set href_urls [list]
    set href_stack [list]

    for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } {
        # append everything up to and not including the tag-opening <
        ad_html_to_text_put_text output [string range $html $last_tag_end $i-1]

        # Check that:
        #  - we're not past the end of the string
        #  - and that the tag starts with either
        #     - alpha or
        #     - a slash, and then alpha
        # Otherwise, it's probably just a lone < character
        if { $i >= $length - 1 ||
             (![string is alpha [string index $html $i+1]]
              && [string index $html $i+1] ne "!"
              && ("/" ne [string index $html $i+1] ||
                  ![string is alpha [string index $html $i+2]]))
         } {
            # Output the < and continue with next character
            ad_html_to_text_put_text output "<"
            set last_tag_end [incr i]
            continue
        } elseif {[string match "!--*" [string range $html $i+1 end]]} {
            # Handle HTML comments, I can't believe no one noticed
            # this before.  This code maybe not be elegant but it
            # works.

            # find the closing comment tag.
            set comment_idx [string first "-->" $html $i]
            if {$comment_idx == -1} {
                # no comment close, escape
                set last_tag_end $i
                set i $comment_idx
                break
            }
            set i [expr {$comment_idx + 3}]
            set last_tag_end $i

            continue
        }
        # we're inside a tag now. Find the end of it

        # make i point to the char after the <
        incr i
        set tag_start $i

        set count 0
        while 1 {
            if {[incr count] > 3000 } {
                # JCD: the programming bug is that an unmatched <
                # in the input runs off forever looking for its
                # closing > and in some long text like program
                # listings you can have lots of quotes before you
                # find that >
                error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop."
            }
            # Find the positions of the first quote, apostrophe and greater-than sign.
            set quote_idx [string first \" $html $i]
            set apostrophe_idx [string first ' $html $i]
            set gt_idx [string first > $html $i]

            # If there is no greater-than sign, then the tag isn't closed.
            if { $gt_idx == -1 } {
                set i $length
                break
            }

            # Find the first of the quote and the apostrophe
            if { $apostrophe_idx == -1 } {
                set string_delimiter_idx $quote_idx
            } else {
                if { $quote_idx == -1 } {
                    set string_delimiter_idx $apostrophe_idx
                } else {
                    if { $apostrophe_idx < $quote_idx } {
                        set string_delimiter_idx $apostrophe_idx
                    } else {
                        set string_delimiter_idx $quote_idx
                    }
                }
            }
            set string_delimiter [string index $html $string_delimiter_idx]

            # If the greater than sign appears before any of the
            # string delimiters, we've found the tag end.
            if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } {
                # we found the tag end
                set i $gt_idx
                break
            }

            # Otherwise, we'll have to skip past the ending string delimiter
            set i [string first $string_delimiter $html [incr string_delimiter_idx]]
            if { $i == -1 } {
                # Missing string end delimiter
                set i $length
                break
            }
            incr i
        }

        set full_tag [string range $html $tag_start $i-1]

        if { ![regexp {^(/?)([^\s]+)[\s]*(\s.*)?$} $full_tag match slash tagname attributes] } {
            # A malformed tag -- just delete it
        } else {

            # Reset/create attribute array
            unset -nocomplain attribute_array

            # Parse the attributes
            ad_parse_html_attributes -attribute_array attribute_array $attributes

            switch -- [string tolower $tagname] {
                p - ul - ol - table {
                    set output(p) 1
                }
                br {
                    ad_html_to_text_put_newline output
                }
                tr - td - th {
                    set output(br) 1
                }
                h1 - h2 - h3 - h4 - h5 - h6 {
                    set output(p) 1
                    if { $slash eq "" } {
                        ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]]
                    }
                }
                li {
                    set output(br) 1
                    if { $slash eq "" } {
                        ad_html_to_text_put_text output "- "
                    }
                }
                strong - b {
                    ad_html_to_text_put_text output "*"
                }
                em - i - cite - u {
                    ad_html_to_text_put_text output "_"
                }
                a {
                    if { !$no_format_p } {
                        if { $slash eq ""} {
                            if { [info exists attribute_array(href)]
                                 && [string index $attribute_array(href) 0] ni {"#" ""}
                             } {
                                if { [info exists attribute_array(title)] } {
                                    set title ": '$attribute_array(title)'"
                                } else {
                                    set title ""
                                }
                                set href_no [expr {[llength $href_urls] + 1}]
                                lappend href_urls "\[$href_no\] $attribute_array(href) "
                                lappend href_stack "\[$href_no$title\]"
                            } elseif { [info exists attribute_array(title)] } {
                                lappend href_stack "\[$attribute_array(title)\]"
                            } else {
                                lappend href_stack {}
                            }
                        } else {
                            if { [llength $href_stack] > 0 } {
                                if { [lindex $href_stack end] ne "" } {
                                    ad_html_to_text_put_text output " [lindex $href_stack end]"
                                }
                                set href_stack [lreplace $href_stack end end]
                            }
                        }
                    }
                }
                pre {
                    set output(p) 1
                    if { $slash eq "" } {
                        incr output(pre)
                    } else {
                        incr output(pre) -1
                    }
                }
                blockquote {
                    set output(p) 1
                    if { $slash eq "" } {
                        incr output(blockquote)
                        incr output(maxlen) -4
                    } else {
                        incr output(blockquote) -1
                        incr output(maxlen) 4
                    }
                }
                hr {
                    set output(p) 1
                    ad_html_to_text_put_text output [string repeat "-" $output(maxlen)]
                    set output(p) 1
                }
                q {
                    ad_html_to_text_put_text output \"
                }
                img {
                    if { $slash eq "" && !$no_format_p } {
                        set img_info {}
                        if { [info exists attribute_array(alt)] } {
                            lappend img_info "'$attribute_array(alt)'"
                        }
                        if { [info exists attribute_array(src)] } {
                            if {[string match "data:*" $attribute_array(src)]} {
                                lappend img_info "data:..."
                            } else {
                                lappend img_info $attribute_array(src)
                            }
                        }
                        if { [llength $img_info] == 0 } {
                            ad_html_to_text_put_text output {[IMAGE]}
                        } else {
                            ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "]\]"
                        }
                    }
                }
                default {
                    # Other tag
                    if { $showtags_p } {
                        ad_html_to_text_put_text output "&lt;$slash$tagname$attributes&gt;"
                    }
                }
            }
        }

        # set end of last tag to the character following the >
        set last_tag_end [incr i]
    }
    # append everything after the last tag
    ad_html_to_text_put_text output [string range $html $last_tag_end end]

    # Close any unclosed tags
    set output(pre) 0
    while { $output(blockquote) > 0 } {
        incr output(blockquote) -1
        incr output(maxlen) 4
    }

    # write out URLs, if necessary:
    if { [llength $href_urls] > 0 } {
        append output(text) "\n\n[join $href_urls "\n"]"
    }

    #---
    # conversion like in ad_text_to_html
    # 2006/09/12
    set  myChars  {
        ª º À Á Â Ã Ä Å Æ Ç
        È É Ê Ë Ì Í Î Ï Ð Ñ
        Ò Ó Ô Õ Ö Ø Ù Ú Û Ü
        Ý Þ ß à á â ã ä å æ
        ç è é ê ë ì í î ï ð
        ñ ò ó ô õ ö ø ù ú û
        ü ý þ ÿ ¿
    }

    set  myHTML  {
        &ordf; &ordm; &Agrave; &Aacute; &Acirc; &Atilde; &Auml; &Aring; &Aelig; &Ccedil;
        &Egrave; &Eacute; &Ecirc; &Euml; &Igrave; &Iacute; &Icirc; &Iuml; &ETH; &Ntilde;
        &Ograve; &Oacute; &Ocirc; &Otilde; &Ouml; &Oslash; &Ugrave; &Uacute; &Ucirc; &Uuml;
        &Yacute; &THORN; &szlig; &agrave; &aacute; &acirc; &atilde; &auml; &aring; &aelig;
        &ccedil; &egrave; &eacute; &ecirc; &euml; &igrave; &iacute; &icirc; &iuml; &eth;
        &ntilde; &ograve; &oacute; &ocirc; &otilde; &ouml; &oslash; &ugrave; &uacute; &ucirc;
        &uuml; &yacute; &thorn; &yuml; &iquest;
    }

    set map {}
    foreach ch $myChars entity $myHTML {
        lappend map $entity $ch
    }

    return [string map $map $output(text)]

XQL Not present:

Generic, PostgreSQL, Oracle

[ hide source ] | [ make this the default ]