ad_html_to_text (public)
ad_html_to_text [ -maxlen maxlen ] [ -showtags ] [ -no_format ] html
Defined in packages/acs-tcl/tcl/text-html-procs.tcl
Returns a best-guess plain text version of an HTML fragment. Parses the HTML and does some simple formatting. The parser and formatting is pretty stupid, but it's better than nothing.
- Switches:
- -maxlen (optional, defaults to
"70"
)- the line length you want your output wrapped to.
- -showtags (optional, boolean)
- causes any unknown (and uninterpreted) tags to get shown in the output.
- -no_format (optional, boolean)
- causes hyperlink tags not to get listed at the end of the output.
- Parameters:
- html (required)
- Authors:
- Lars Pind <lars@pinds.com>
- Aaron Swartz <aaron@swartzfam.com>
- Created:
- 19 July 2000
- Partial Call Graph (max 5 caller/called nodes):
- Testcases:
- html_to_text, ad_html_to_text_bold, ad_html_to_text_anchor, ad_html_to_text_image, ad_html_to_text_clipped_link, text_to_html
Source code: set output(text) {} set output(linelen) 0 set output(maxlen) $maxlen set output(pre) 0 set output(p) 0 set output(br) 0 set output(space) 0 set output(blockquote) 0 set length [string length $html] set last_tag_end 0 # For showing the URL of links. set href_urls [list] set href_stack [list] for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { # append everything up to and not including the tag-opening < ad_html_to_text_put_text output [string range $html $last_tag_end $i-1] # Check that: # - we're not past the end of the string # - and that the tag starts with either # - alpha or # - a slash, and then alpha # Otherwise, it's probably just a lone < character if { $i >= $length - 1 || (![string is alpha [string index $html $i+1]] && [string index $html $i+1] ne "!" && ("/" ne [string index $html $i+1] || ![string is alpha [string index $html $i+2]])) } { # Output the < and continue with next character ad_html_to_text_put_text output "<" set last_tag_end [incr i] continue } elseif {[string match "!--*" [string range $html $i+1 end]]} { # Handle HTML comments, I can't believe no one noticed # this before. This code maybe not be elegant but it # works. # find the closing comment tag. set comment_idx [string first "-->" $html $i] if {$comment_idx == -1} { # no comment close, escape set last_tag_end $i set i $comment_idx break } set i [expr {$comment_idx + 3}] set last_tag_end $i continue } # we're inside a tag now. Find the end of it # make i point to the char after the < incr i set tag_start $i set count 0 while 1 { if {[incr count] > 3000 } { # JCD: the programming bug is that an unmatched < # in the input runs off forever looking for its # closing > and in some long text like program # listings you can have lots of quotes before you # find that > error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop." } # Find the positions of the first quote, apostrophe and greater-than sign. set quote_idx [string first \" $html $i] set apostrophe_idx [string first ' $html $i] set gt_idx [string first > $html $i] # If there is no greater-than sign, then the tag isn't closed. if { $gt_idx == -1 } { set i $length break } # Find the first of the quote and the apostrophe if { $apostrophe_idx == -1 } { set string_delimiter_idx $quote_idx } else { if { $quote_idx == -1 } { set string_delimiter_idx $apostrophe_idx } else { if { $apostrophe_idx < $quote_idx } { set string_delimiter_idx $apostrophe_idx } else { set string_delimiter_idx $quote_idx } } } set string_delimiter [string index $html $string_delimiter_idx] # If the greater than sign appears before any of the # string delimiters, we've found the tag end. if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { # we found the tag end set i $gt_idx break } # Otherwise, we'll have to skip past the ending string delimiter set i [string first $string_delimiter $html [incr string_delimiter_idx]] if { $i == -1 } { # Missing string end delimiter set i $length break } incr i } set full_tag [string range $html $tag_start $i-1] if { ![regexp {^(/?)([^\s]+)[\s]*(\s.*)?$} $full_tag match slash tagname attributes] } { # A malformed tag -- just delete it } else { # Reset/create attribute array unset -nocomplain attribute_array # Parse the attributes ad_parse_html_attributes -attribute_array attribute_array $attributes switch -- [string tolower $tagname] { p - ul - ol - table { set output(p) 1 } br { ad_html_to_text_put_newline output } tr - td - th { set output(br) 1 } h1 - h2 - h3 - h4 - h5 - h6 { set output(p) 1 if { $slash eq "" } { ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]] } } li { set output(br) 1 if { $slash eq "" } { ad_html_to_text_put_text output "- " } } strong - b { ad_html_to_text_put_text output "*" } em - i - cite - u { ad_html_to_text_put_text output "_" } a { if { !$no_format_p } { if { $slash eq ""} { if { [info exists attribute_array(href)] && [string index $attribute_array(href) 0] ni {"#" ""} } { if { [info exists attribute_array(title)] } { set title ": '$attribute_array(title)'" } else { set title "" } set href_no [expr {[llength $href_urls] + 1}] lappend href_urls "\[$href_no\] $attribute_array(href) " lappend href_stack "\[$href_no$title\]" } elseif { [info exists attribute_array(title)] } { lappend href_stack "\[$attribute_array(title)\]" } else { lappend href_stack {} } } else { if { [llength $href_stack] > 0 } { if { [lindex $href_stack end] ne "" } { ad_html_to_text_put_text output " [lindex $href_stack end]" } set href_stack [lreplace $href_stack end end] } } } } pre { set output(p) 1 if { $slash eq "" } { incr output(pre) } else { incr output(pre) -1 } } blockquote { set output(p) 1 if { $slash eq "" } { incr output(blockquote) incr output(maxlen) -4 } else { incr output(blockquote) -1 incr output(maxlen) 4 } } hr { set output(p) 1 ad_html_to_text_put_text output [string repeat "-" $output(maxlen)] set output(p) 1 } q { ad_html_to_text_put_text output \" } img { if { $slash eq "" && !$no_format_p } { set img_info {} if { [info exists attribute_array(alt)] } { lappend img_info "'$attribute_array(alt)'" } if { [info exists attribute_array(src)] } { if {[string match "data:*" $attribute_array(src)]} { lappend img_info "data:..." } else { lappend img_info $attribute_array(src) } } if { [llength $img_info] == 0 } { ad_html_to_text_put_text output {[IMAGE]} } else { ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "]\]" } } } default { # Other tag if { $showtags_p } { ad_html_to_text_put_text output "<$slash$tagname$attributes>" } } } } # set end of last tag to the character following the > set last_tag_end [incr i] } # append everything after the last tag ad_html_to_text_put_text output [string range $html $last_tag_end end] # Close any unclosed tags set output(pre) 0 while { $output(blockquote) > 0 } { incr output(blockquote) -1 incr output(maxlen) 4 } # write out URLs, if necessary: if { [llength $href_urls] > 0 } { append output(text) "\n\n[join $href_urls "\n"]" } #--- # conversion like in ad_text_to_html # 2006/09/12 set myChars { ª º À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ ¿ } set myHTML { ª º À Á Â Ã Ä Å &Aelig; Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ ¿ } set map {} foreach ch $myChars entity $myHTML { lappend map $entity $ch } return [string map $map $output(text)]XQL Not present: Generic, PostgreSQL, Oracle