Allow reading XML files in encodings other than UTF-8
diff --git a/etree.go b/etree.go
index e6fac66..e228e1b 100644
--- a/etree.go
+++ b/etree.go
@@ -24,6 +24,13 @@
// ErrXML is returned when XML parsing fails due to incorrect formatting.
var ErrXML = errors.New("etree: invalid XML format")
+// ReadSettings allow for changing the default behavior of the ReadFrom*
+// methods.
+type ReadSettings struct {
+ // CharsetReader to be passed to standard xml.Decoder. Default: nil.
+ CharsetReader func(charset string, input io.Reader) (io.Reader, error)
+}
+
// WriteSettings allow for changing the serialization behavior of the WriteTo*
// methods.
type WriteSettings struct {
@@ -42,6 +49,11 @@
CanonicalAttrVal bool
}
+// newReadSettings creates a default ReadSettings record.
+func newReadSettings() ReadSettings {
+ return ReadSettings{}
+}
+
// newWriteSettings creates a default WriteSettings record.
func newWriteSettings() WriteSettings {
return WriteSettings{
@@ -66,6 +78,7 @@
// processing instructions or BOM CharData tokens.
type Document struct {
Element
+ ReadSettings ReadSettings
WriteSettings WriteSettings
}
@@ -113,13 +126,14 @@
func NewDocument() *Document {
return &Document{
Element{Child: make([]Token, 0)},
+ newReadSettings(),
newWriteSettings(),
}
}
// Copy returns a recursive, deep copy of the document.
func (d *Document) Copy() *Document {
- return &Document{*(d.dup(nil).(*Element)), d.WriteSettings}
+ return &Document{*(d.dup(nil).(*Element)), d.ReadSettings, d.WriteSettings}
}
// Root returns the root element of the document, or nil if there is no root
@@ -157,7 +171,7 @@
// ReadFrom reads XML from the reader r into the document d. It returns the
// number of bytes read and any error encountered.
func (d *Document) ReadFrom(r io.Reader) (n int64, err error) {
- return d.Element.readFrom(r)
+ return d.Element.readFrom(r, d.ReadSettings.CharsetReader)
}
// ReadFromFile reads XML from the string s into the document d.
@@ -362,9 +376,10 @@
// ReadFrom reads XML from the reader r and stores the result as a new child
// of element e.
-func (e *Element) readFrom(ri io.Reader) (n int64, err error) {
+func (e *Element) readFrom(ri io.Reader, charsetReader func(charset string, input io.Reader) (io.Reader, error)) (n int64, err error) {
r := newCountReader(ri)
dec := xml.NewDecoder(r)
+ dec.CharsetReader = charsetReader
var stack stack
stack.push(e)
for {
diff --git a/etree_test.go b/etree_test.go
index be92652..e206e3b 100644
--- a/etree_test.go
+++ b/etree_test.go
@@ -4,7 +4,10 @@
package etree
-import "testing"
+import (
+ "io"
+ "testing"
+)
func checkEq(t *testing.T, got, want string) {
if got == want {
@@ -136,6 +139,25 @@
}
}
+func TestDocumentRead_NonUTF8Encodings(t *testing.T) {
+ s := `<?xml version="1.0" encoding="ISO-8859-1"?>
+ <store>
+ <book lang="en">
+ <title>Great Expectations</title>
+ <author>Charles Dickens</author>
+ </book>
+</store>`
+
+ doc := NewDocument()
+ doc.ReadSettings.CharsetReader = func(label string, input io.Reader) (io.Reader, error) {
+ return input, nil
+ }
+ err := doc.ReadFromString(s)
+ if err != nil {
+ t.Fatal("etree: incorrect ReadFromString result")
+ }
+}
+
func TestWriteSettings(t *testing.T) {
BOM := "\xef\xbb\xbf"