This procedure is defined in the server but not documented via ad_proc or proc_doc and may be intended as a private interface.
The procedure is defined as:
# tdom::xmlOpenFileWorker --
#
#   Open an XML file, auto-detect its encoding (following the XML
#   Recommendation, Appendix F), position the channel at the place
#   where parsing has to start and configure the channel's -encoding
#   and -translation accordingly.
#
# Arguments:
#   filename        Name of the file to open.
#   encodingString  Name of a variable in the caller's scope which
#                   receives the name of the detected encoding (e.g.
#                   UTF-8, UTF-16le or the name given in the XML
#                   declaration). Pass {} if that isn't of interest.
#   forSimple       Boolean. If this or forRead is true, UTF-16 data is
#                   decoded by the channel itself and a BOM is skipped
#                   (presumably: the channel is meant for the simple
#                   parser - confirm against the callers).
#   forRead         Boolean, handled exactly as forSimple (presumably:
#                   the caller reads the data with [read] - confirm
#                   against the callers).
#
# Results:
#   Returns the opened and configured channel. If anything fails after
#   the file was opened (unsupported encoding, unrecognized encoding
#   name, not XML at all) the channel is closed before the error is
#   re-raised, so no channel leaks.

proc tdom::xmlOpenFileWorker {filename encodingString forSimple forRead} {
    if {$encodingString ne ""} {
        # Explicit level, so that a variable name which looks like a
        # level (e.g. "1" or "#0") isn't misinterpreted.
        upvar 1 $encodingString encString
    }
    set fd [open $filename]
    if {[catch {
        xmlOpenFileWorkerDetect $fd encString $forSimple $forRead
    } errMsg errOpts]} {
        # Don't leak the channel; a failing close must not hide the
        # original error.
        catch {close $fd}
        return -options $errOpts $errMsg
    }
    return $fd
}

# tdom::xmlOpenFileWorkerDetect --
#
#   Private helper of xmlOpenFileWorker: does the encoding detection
#   and the configuration of the already opened channel fd. The name
#   of the detected encoding is stored in the variable encVarName of
#   the calling proc. Raises an error for unsupported or unrecognized
#   encodings; closing the channel is the job of the caller.

proc tdom::xmlOpenFileWorkerDetect {fd encVarName forSimple forRead} {
    variable utf8bom
    upvar 1 $encVarName encString

    # This partly (mis-)uses the encoding of a channel handed to [dom
    # parse -channel ..] as a marker: if the channel encoding is utf-8
    # then behind the scenes Tcl_Read() is used, otherwise
    # Tcl_ReadChars(). This is used for the encodings understood (and
    # checked) by the used expat implementation: utf-8 and utf-16 (in
    # either byte order).
    #
    # The -translation auto used in the fconfigure commands which
    # set the encoding isn't strictly necessary in case the parser is
    # expat (because it handles that internally) but it is the right
    # thing for the simple parser.

    set tcl9 [package vsatisfies [package provide Tcl] 9-]

    # The autodetection of the encoding follows
    # XML Recommendation, Appendix F
    fconfigure $fd -translation binary
    if {![binary scan [read $fd 4] "H8" firstBytes]} {
        # Very short (< 4 bytes) file, that means not a well-formed
        # XML at all (the shortest possible would be <[a-zA-Z]/>).
        # Don't report that here but let the parser do that.
        seek $fd 0 start
        set encString UTF-8
        return
    }

    # First check for a UTF-16 BOM
    switch -- [string range $firstBytes 0 3] {
        "feff" {
            # feff: UTF-16, big-endian BOM
            if {$forSimple || $forRead} {
                if {!$tcl9} {
                    error "UTF-16be is not supported"
                }
                # Report the encoding to the caller in this case as
                # well, as the little-endian branch does.
                set encString UTF-16be
                seek $fd 2 start
                fconfigure $fd -encoding utf-16be -translation auto
            } else {
                set encString UTF-16be
                seek $fd 0 start
                fconfigure $fd -encoding utf-8 -translation auto
            }
            return
        }
        "fffe" {
            # fffe: UTF-16, little-endian BOM
            set encString UTF-16le
            if {$forSimple || $forRead} {
                seek $fd 2 start
                if {$tcl9} {
                    fconfigure $fd -encoding utf-16le -translation auto
                } else {
                    fconfigure $fd -encoding unicode -translation auto
                }
            } else {
                seek $fd 0 start
                fconfigure $fd -encoding utf-8 -translation auto
            }
            return
        }
    }

    if {$utf8bom} {
        # According to the Unicode standard
        # (http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf) the
        # "[u]se of a BOM is neither required nor recommended for
        # UTF-8". Nevertheless such files exist. If the programmer
        # explicitly enables this by setting ::tdom::utf8bom to true
        # this is handled here.
        if {[string range $firstBytes 0 5] eq "efbbbf"} {
            set encString UTF-8
            seek $fd 3 start
            fconfigure $fd -encoding utf-8 -translation auto
            return
        }
    }

    # If the entity has a XML Declaration, the first four characters
    # must be "<?xm".
    switch -- $firstBytes {
        "3c3f786d" {
            # UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS,
            # EUC, or any other 7-bit, 8-bit, or mixed-width encoding
            # which ensures that the characters of ASCII have their
            # normal positions, width and values; the actual encoding
            # declaration must be read to detect which of these
            # applies, but since all of these encodings use the same
            # bit patterns for the ASCII characters, the encoding
            # declaration itself can be read reliably.
            # First 300 bytes should be enough for a XML Declaration.
            # This is of course not 100 percent bullet-proof.
            set head [read $fd 296]
            # Try to find the end of the XML Declaration
            set closeIndex [string first ">" $head]
            if {$closeIndex == -1} {
                error "Weird XML data or not XML data at all"
            }
            # Position the channel right behind the XML Declaration:
            # closeIndex is relative to head, which starts after the
            # 4 bytes already read, so the ">" is at file offset
            # closeIndex + 4.
            seek $fd [expr {$closeIndex + 5}] start
            # Extract the encoding information. The XML grammar allows
            # white space on both sides of the "=" (production Eq).
            set pattern {^[^>]+encoding[\x20\x9\xd\xa]*=[\x20\x9\xd\xa]*["']([^ "']+)['"]}
            # emacs: "
            if {![regexp $pattern $head - encStr]} {
                # Probably something like <?xml version="1.0"?>.
                # Without encoding declaration this must be UTF-8
                set encoding utf-8
                set encString UTF-8
            } else {
                set encoding [IANAEncoding2TclEncoding $encStr]
                set encString $encStr
            }
        }
        "0000003c" -
        "003c0000" -
        "3c000000" -
        "00003c00" {
            # UCS-4 in one of the four byte orders (1234, 3412, 4321,
            # 2143) listed in Appendix F
            error "UCS-4 not supported"
        }
        "003c003f" {
            # UTF-16, big-endian, no BOM
            if {$forSimple || $forRead} {
                if {$tcl9} {
                    set encoding utf-16be
                } else {
                    error "UTF-16be is not supported by the simple parser"
                }
            } else {
                set encoding utf-8
            }
            seek $fd 0 start
            set encString UTF-16be
        }
        "3c003f00" {
            # UTF-16, little-endian, no BOM
            if {$forSimple || $forRead} {
                if {$tcl9} {
                    set encoding utf-16le
                } else {
                    set encoding unicode
                }
            } else {
                set encoding utf-8
            }
            seek $fd 0 start
            set encString UTF-16le
        }
        "4c6fa794" {
            # EBCDIC in some flavor
            # NOTE(review): encString isn't set in this branch - confirm
            # whether callers rely on it for EBCDIC input.
            if {$tcl9} {
                seek $fd 0 start
                set encoding ebcdic
            } else {
                error "EBCDIC not supported"
            }
        }
        default {
            # UTF-8 without an encoding declaration
            seek $fd 0 start
            set encoding utf-8
            set encString "UTF-8"
        }
    }
    fconfigure $fd -encoding $encoding -translation auto
    return
}