ad_html_to_text (public)

 ad_html_to_text [ -maxlen maxlen ] [ -showtags ] [ -no_format ] html

Defined in packages/acs-tcl/tcl/text-html-procs.tcl

Returns a best-guess plain text version of an HTML fragment. Parses the HTML and does some simple formatting. The parser and formatting is pretty stupid, but it's better than nothing.

Switches:
-maxlen
(defaults to "70") (optional)
the line length you want your output wrapped to.
-showtags
(boolean) (optional)
causes any unknown (and uninterpreted) tags to get shown in the output.
-no_format
(boolean) (optional)
causes hyperlink tags not to get listed at the end of the output.
Parameters:
html
Authors:
Lars Pind <lars@pinds.com>
Aaron Swartz <aaron@swartzfam.com>
Created:
19 July 2000

Partial Call Graph (max 5 caller/called nodes):
%3 test_ad_html_to_text_anchor ad_html_to_text_anchor (test acs-tcl) ad_html_to_text ad_html_to_text test_ad_html_to_text_anchor->ad_html_to_text test_ad_html_to_text_bold ad_html_to_text_bold (test acs-tcl) test_ad_html_to_text_bold->ad_html_to_text test_ad_html_to_text_clipped_link ad_html_to_text_clipped_link (test acs-tcl) test_ad_html_to_text_clipped_link->ad_html_to_text test_ad_html_to_text_image ad_html_to_text_image (test acs-tcl) test_ad_html_to_text_image->ad_html_to_text test_html_to_text html_to_text (test acs-tcl) test_html_to_text->ad_html_to_text ad_html_to_text_put_newline ad_html_to_text_put_newline (private) ad_html_to_text->ad_html_to_text_put_newline ad_html_to_text_put_text ad_html_to_text_put_text (private) ad_html_to_text->ad_html_to_text_put_text ad_parse_html_attributes ad_parse_html_attributes (public) ad_html_to_text->ad_parse_html_attributes acs_admin::check_expired_certificates acs_admin::check_expired_certificates (private) acs_admin::check_expired_certificates->ad_html_to_text acs_mail_lite::utils::build_body acs_mail_lite::utils::build_body (private) acs_mail_lite::utils::build_body->ad_html_to_text ad_enhanced_text_to_plain_text ad_enhanced_text_to_plain_text (public) ad_enhanced_text_to_plain_text->ad_html_to_text ad_html_text_convert ad_html_text_convert (public) ad_html_text_convert->ad_html_to_text ad_parse_incoming_email ad_parse_incoming_email (public) ad_parse_incoming_email->ad_html_to_text

Testcases:
html_to_text, ad_html_to_text_bold, ad_html_to_text_anchor, ad_html_to_text_image, ad_html_to_text_clipped_link, text_to_html
Source code:
        set output(text) {}
        set output(linelen) 0
        set output(maxlen) $maxlen
        set output(pre) 0
        set output(p) 0
        set output(br) 0
        set output(space) 0
        set output(blockquote) 0

        set length [string length $html]
        set last_tag_end 0

        # For showing the URL of links.
        set href_urls [list]
        set href_stack [list]

        for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } {
            # append everything up to and not including the tag-opening <
            ad_html_to_text_put_text output [string range $html $last_tag_end $i-1]

            # Check that:
            #  - we're not past the end of the string
            #  - and that the tag starts with either
            #     - alpha or
            #     - a slash, and then alpha
            # Otherwise, it's probably just a lone < character
            if { $i >= $length - 1 ||
                 (![string is alpha [string index $html $i+1]]
                  && [string index $html $i+1] ne "!"
                  && ("/" ne [string index $html $i+1] ||
                      ![string is alpha [string index $html $i+2]]))
             } {
                # Output the < and continue with next character
                ad_html_to_text_put_text output "<"
                set last_tag_end [incr i]
                continue
            } elseif {[string match "!--*" [string range $html $i+1 end]]} {
                # Handle HTML comments, I can't believe no one noticed
                # this before.  This code maybe not be elegant but it
                # works.

                # find the closing comment tag.
                set comment_idx [string first "-->" $html $i]
                if {$comment_idx == -1} {
                    # no comment close, escape
                    set last_tag_end $i
                    set i $comment_idx
                    break
                }
                set i [expr {$comment_idx + 3}]
                set last_tag_end $i

                continue
            }
            # we're inside a tag now. Find the end of it

            # make i point to the char after the <
            incr i
            set tag_start $i

            set count 0
            while 1 {
                if {[incr count] > 3000 } {
                    # JCD: the programming bug is that an unmatched <
                    # in the input runs off forever looking for its
                    # closing > and in some long text like program
                    # listings you can have lots of quotes before you
                    # find that >
                    error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop."
                }
                # Find the positions of the first quote, apostrophe and greater-than sign.
                set quote_idx [string first \" $html $i]
                set apostrophe_idx [string first ' $html $i]
                set gt_idx [string first > $html $i]

                # If there is no greater-than sign, then the tag isn't closed.
                if { $gt_idx == -1 } {
                    set i $length
                    break
                }

                # Find the first of the quote and the apostrophe
                if { $apostrophe_idx == -1 } {
                    set string_delimiter_idx $quote_idx
                } else {
                    if { $quote_idx == -1 } {
                        set string_delimiter_idx $apostrophe_idx
                    } else {
                        if { $apostrophe_idx < $quote_idx } {
                            set string_delimiter_idx $apostrophe_idx
                        } else {
                            set string_delimiter_idx $quote_idx
                        }
                    }
                }
                set string_delimiter [string index $html $string_delimiter_idx]

                # If the greater than sign appears before any of the
                # string delimiters, we've found the tag end.
                if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } {
                    # we found the tag end
                    set i $gt_idx
                    break
                }

                # Otherwise, we'll have to skip past the ending string delimiter
                set i [string first $string_delimiter $html [incr string_delimiter_idx]]
                if { $i == -1 } {
                    # Missing string end delimiter
                    set i $length
                    break
                }
                incr i
            }

            set full_tag [string range $html $tag_start $i-1]

            if { ![regexp {^(/?)([^\s]+)[\s]*(\s.*)?$} $full_tag match slash tagname attributes] } {
                # A malformed tag -- just delete it
            } else {

                # Reset/create attribute array
                array unset attribute_array

                # Parse the attributes
                ad_parse_html_attributes -attribute_array attribute_array $attributes

                switch -- [string tolower $tagname] {
                    p - ul - ol - table {
                        set output(p) 1
                    }
                    br {
                        ad_html_to_text_put_newline output
                    }
                    tr - td - th {
                        set output(br) 1
                    }
                    h1 - h2 - h3 - h4 - h5 - h6 {
                        set output(p) 1
                        if { $slash eq "" } {
                            ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]]
                        }
                    }
                    li {
                        set output(br) 1
                        if { $slash eq "" } {
                            ad_html_to_text_put_text output "- "
                        }
                    }
                    strong - b {
                        ad_html_to_text_put_text output "*"
                    }
                    em - i - cite - u {
                        ad_html_to_text_put_text output "_"
                    }
                    a {
                        if { !$no_format_p } {
                            if { $slash eq ""} {
                                if { [info exists attribute_array(href)]
                                     && [string index $attribute_array(href) 0] ni {"#" ""}
                                 } {
                                    if { [info exists attribute_array(title)] } {
                                        set title ": '$attribute_array(title)'"
                                    } else {
                                        set title ""
                                    }
                                    set href_no [expr {[llength $href_urls] + 1}]
                                    lappend href_urls "\[$href_no\] $attribute_array(href) "
                                    lappend href_stack "\[$href_no$title\]"
                                } elseif { [info exists attribute_array(title)] } {
                                    lappend href_stack "\[$attribute_array(title)\]"
                                } else {
                                    lappend href_stack {}
                                }
                            } else {
                                if { [llength $href_stack] > 0 } {
                                    if { [lindex $href_stack end] ne "" } {
                                        ad_html_to_text_put_text output " [lindex $href_stack end]"
                                    }
                                    set href_stack [lreplace $href_stack end end]
                                }
                            }
                        }
                    }
                    pre {
                        set output(p) 1
                        if { $slash eq "" } {
                            incr output(pre)
                        } else {
                            incr output(pre) -1
                        }
                    }
                    blockquote {
                        set output(p) 1
                        if { $slash eq "" } {
                            incr output(blockquote)
                            incr output(maxlen) -4
                        } else {
                            incr output(blockquote) -1
                            incr output(maxlen) 4
                        }
                    }
                    hr {
                        set output(p) 1
                        ad_html_to_text_put_text output [string repeat "-" $output(maxlen)]
                        set output(p) 1
                    }
                    q {
                        ad_html_to_text_put_text output \"
                    }
                    img {
                        if { $slash eq "" && !$no_format_p } {
                            set img_info {}
                            if { [info exists attribute_array(alt)] } {
                                lappend img_info "'$attribute_array(alt)'"
                            }
                            if { [info exists attribute_array(src)] } {
                                if {[string match "data:*" $attribute_array(src)]} {
                                    lappend img_info "data:..."
                                } else {
                                    lappend img_info $attribute_array(src)
                                }
                            }
                            if { [llength $img_info] == 0 } {
                                ad_html_to_text_put_text output {[IMAGE]}
                            } else {
                                ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "]\]"
                            }
                        }
                    }
                    default {
                        # Other tag
                        if { $showtags_p } {
                            ad_html_to_text_put_text output "&lt;$slash$tagname$attributes&gt;"
                        }
                    }
                }
            }

            # set end of last tag to the character following the >
            set last_tag_end [incr i]
        }
        # append everything after the last tag
        ad_html_to_text_put_text output [string range $html $last_tag_end end]

        # Close any unclosed tags
        set output(pre) 0
        while { $output(blockquote) > 0 } {
            incr output(blockquote) -1
            incr output(maxlen) 4
        }

        # write out URLs, if necessary:
        if { [llength $href_urls] > 0 } {
            append output(text) "\n\n[join $href_urls "\n"]"
        }

        #---
        # conversion like in ad_text_to_html
        # 2006/09/12
        set  myChars  {
            ª º À Á Â Ã Ä Å Æ Ç
            È É Ê Ë Ì Í Î Ï Ð Ñ
            Ò Ó Ô Õ Ö Ø Ù Ú Û Ü
            Ý Þ ß à á â ã ä å æ
            ç è é ê ë ì í î ï ð
            ñ ò ó ô õ ö ø ù ú û
            ü ý þ ÿ ¿
        }

        set  myHTML  {
            &ordf; &ordm; &Agrave; &Aacute; &Acirc; &Atilde; &Auml; &Aring; &Aelig; &Ccedil;
            &Egrave; &Eacute; &Ecirc; &Euml; &Igrave; &Iacute; &Icirc; &Iuml; &ETH; &Ntilde;
            &Ograve; &Oacute; &Ocirc; &Otilde; &Ouml; &Oslash; &Ugrave; &Uacute; &Ucirc; &Uuml;
            &Yacute; &THORN; &szlig; &agrave; &aacute; &acirc; &atilde; &auml; &aring; &aelig;
            &ccedil; &egrave; &eacute; &ecirc; &euml; &igrave; &iacute; &icirc; &iuml; &eth;
            &ntilde; &ograve; &oacute; &ocirc; &otilde; &ouml; &oslash; &ugrave; &uacute; &ucirc;
            &uuml; &yacute; &thorn; &yuml; &iquest;
        }

        set map {}
        foreach ch $myChars entity $myHTML {
            lappend map $entity $ch
        }

        return [string map $map $output(text)]
XQL Not present:
Generic, PostgreSQL, Oracle
[ hide source ] | [ make this the default ]
Show another procedure: