Skip to content

Commit

Permalink
Default CharsetReader improvements
Browse files Browse the repository at this point in the history
When nil, the ReadSettings struct's CharsetReader field now causes
the XML decoder to use a "pass-through" charset converter, passing
the reader's data through without modification.

This was already the default behavior when creating a new etree
document with the NewDocument function, but now a default-
constructed ReadSettings struct will result in the same default
CharsetReader behavior.
  • Loading branch information
beevik committed Jul 8, 2024
1 parent 7113fd9 commit e8292cc
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 36 deletions.
51 changes: 22 additions & 29 deletions etree.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,14 @@ var ErrXML = errors.New("etree: invalid XML format")
var cdataPrefix = []byte("<![CDATA[")

// ReadSettings determine the default behavior of the Document's ReadFrom*
// methods.
// functions.
type ReadSettings struct {
// CharsetReader to be passed to standard xml.Decoder. Default: nil.
// CharsetReader, if non-nil, defines a function to generate
// charset-conversion readers, converting from the provided non-UTF-8
// charset into UTF-8. If nil, the ReadFrom* functions will use a
// "pass-through" CharsetReader that performs no conversion on the reader's
// data regardless of the value of the "charset" encoding string. Default:
// nil.
CharsetReader func(charset string, input io.Reader) (io.Reader, error)

// Permissive allows input containing common mistakes such as missing tags
Expand Down Expand Up @@ -72,13 +77,11 @@ type ReadSettings struct {
AutoClose []string
}

// newReadSettings creates a default ReadSettings record.
func newReadSettings() ReadSettings {
return ReadSettings{
CharsetReader: func(label string, input io.Reader) (io.Reader, error) {
return input, nil
},
}
// defaultCharsetReader is the fallback charset reader handed to the xml
// decoder when the ReadSettings CharsetReader value is nil. It is a
// "pass-through": the requested charset is ignored and the input reader is
// returned as-is, so no conversion takes place.
func defaultCharsetReader(charset string, input io.Reader) (io.Reader, error) {
	// Hand the caller's reader straight back; the charset label is unused.
	passthrough := input
	return passthrough, nil
}

// dup creates a duplicate of the ReadSettings object.
Expand All @@ -97,7 +100,7 @@ func (s *ReadSettings) dup() ReadSettings {
}
}

// WriteSettings determine the behavior of the Document's WriteTo* methods.
// WriteSettings determine the behavior of the Document's WriteTo* functions.
type WriteSettings struct {
// CanonicalEndTags forces the production of XML end tags, even for
// elements that have no child elements. Default: false.
Expand All @@ -118,31 +121,20 @@ type WriteSettings struct {
// false.
AttrSingleQuote bool

// UseCRLF causes the document's Indent* methods to use a carriage return
// UseCRLF causes the document's Indent* functions to use a carriage return
// followed by a linefeed ("\r\n") when outputting a newline. If false,
// only a linefeed is used ("\n"). Default: false.
//
// Deprecated: UseCRLF is deprecated. Use IndentSettings.UseCRLF instead.
UseCRLF bool
}

// newWriteSettings builds a WriteSettings record holding the default values:
// every option listed below starts out disabled.
func newWriteSettings() WriteSettings {
	var ws WriteSettings
	ws.CanonicalEndTags = false
	ws.CanonicalText = false
	ws.CanonicalAttrVal = false
	ws.AttrSingleQuote = false
	ws.UseCRLF = false
	return ws
}

// dup returns a copy of the WriteSettings value that s points to.
func (s *WriteSettings) dup() WriteSettings {
	clone := *s
	return clone
}

// IndentSettings determine the behavior of the Document's Indent* methods.
// IndentSettings determine the behavior of the Document's Indent* functions.
type IndentSettings struct {
// Spaces indicates the number of spaces to insert for each level of
// indentation. Set to etree.NoIndent to remove all indentation. Ignored
Expand All @@ -158,7 +150,7 @@ type IndentSettings struct {
// for a newline ("\n"). Default: false.
UseCRLF bool

// PreserveLeafWhitespace causes indent methods to preserve whitespace
// PreserveLeafWhitespace causes indent functions to preserve whitespace
// within XML elements containing only non-CDATA character data. Default:
// false.
PreserveLeafWhitespace bool
Expand Down Expand Up @@ -200,7 +192,7 @@ func getIndentFunc(s *IndentSettings) indentFunc {
}
}

// Writer is the interface that wraps the Write* methods called by each token
// Writer is the interface that wraps the Write* functions called by each token
// type's WriteTo function.
type Writer interface {
io.StringWriter
Expand Down Expand Up @@ -265,7 +257,7 @@ const (

// CharData may be used to represent simple text data or a CDATA section
// within an XML document. The Data property should never be modified
// directly; use the SetData method instead.
// directly; use the SetData function instead.
type CharData struct {
Data string // the simple text or CDATA section content
parent *Element
Expand Down Expand Up @@ -298,9 +290,7 @@ type ProcInst struct {
// NewDocument creates an XML document without a root element.
func NewDocument() *Document {
return &Document{
Element: Element{Child: make([]Token, 0)},
ReadSettings: newReadSettings(),
WriteSettings: newWriteSettings(),
Element: Element{Child: make([]Token, 0)},
}
}

Expand Down Expand Up @@ -433,6 +423,9 @@ func validateXML(r io.Reader, settings ReadSettings) error {
func newDecoder(r io.Reader, settings ReadSettings) *xml.Decoder {
d := xml.NewDecoder(r)
d.CharsetReader = settings.CharsetReader
if d.CharsetReader == nil {
d.CharsetReader = defaultCharsetReader
}
d.Strict = !settings.Permissive
d.Entity = settings.Entity
d.AutoClose = settings.AutoClose
Expand Down
16 changes: 9 additions & 7 deletions etree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,16 +257,14 @@ func TestDocumentCharsetReader(t *testing.T) {
</Book>
</Store>`

charsetLabel := ""
doc := newDocumentFromString2(t, s, ReadSettings{
CharsetReader: func(label string, input io.Reader) (io.Reader, error) {
charsetLabel = label
return &lowercaseCharsetReader{input}, nil
if label == "lowercase" {
return &lowercaseCharsetReader{input}, nil
}
return nil, errors.New("unknown charset")
},
})
if charsetLabel != "lowercase" {
t.Fatalf("etree: incorrect charset encoding, expected lowercase, got %s", charsetLabel)
}

cases := []struct {
path string
Expand Down Expand Up @@ -772,9 +770,13 @@ func TestSortAttrs(t *testing.T) {
checkStrEq(t, out, `<el AAA="1" Foo="2" a01="3" aaa="4" foo="5" z="6" สวัสดี="7" a:AAA="8" a:ZZZ="9"/>`+"\n")
}

func TestCharsetReaderEncoding(t *testing.T) {
func TestCharsetReaderDefaultSetting(t *testing.T) {
// Test encodings where the default pass-through charset conversion
// should work for common single-byte character encodings.
cases := []string{
`<?xml version="1.0"?><foo></foo>`,
`<?xml version="1.0" encoding="ISO-8859-1"?><foo></foo>`,
`<?xml version="1.0" encoding="Windows-1252"?><foo></foo>`,
`<?xml version="1.0" encoding="UTF-8"?><foo></foo>`,
`<?xml version="1.0" encoding="US-ASCII"?><foo></foo>`,
}
Expand Down

0 comments on commit e8292cc

Please sign in to comment.