x/net/html/charset: add NewReaderByName
This provides a CharsetReader function for xml.Decoder.
Change-Id: Id00787bbdee90d267d38c84c98a06f9e10d93336
Reviewed-on: https://go-review.googlesource.com/4420
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/html/charset/charset.go b/html/charset/charset.go
index 2e5f9ba..84e6062 100644
--- a/html/charset/charset.go
+++ b/html/charset/charset.go
@@ -10,6 +10,7 @@
import (
"bytes"
+ "fmt"
"io"
"mime"
"strings"
@@ -110,6 +111,18 @@
return r, nil
}
+// NewReaderByName wraps input in a reader that decodes the named charset into
+// UTF-8. The name is resolved with Lookup; when no encoding is found, a nil
+// reader and an "unsupported charset" error are returned. The signature fits
+// the CharsetReader field of encoding/xml.Decoder.
+func NewReaderByName(charset string, input io.Reader) (io.Reader, error) {
+	enc, _ := Lookup(charset) // second result of Lookup is deliberately unused
+	if enc != nil {
+		return transform.NewReader(input, enc.NewDecoder()), nil
+	}
+	return nil, fmt.Errorf("unsupported charset: %q", charset)
+}
+
func prescan(content []byte) (e encoding.Encoding, name string) {
z := html.NewTokenizer(bytes.NewReader(content))
for {
diff --git a/html/charset/charset_test.go b/html/charset/charset_test.go
index d309f75..44a1867 100644
--- a/html/charset/charset_test.go
+++ b/html/charset/charset_test.go
@@ -6,6 +6,7 @@
import (
"bytes"
+ "encoding/xml"
"io/ioutil"
"runtime"
"strings"
@@ -213,3 +214,23 @@
}
}
}
+
+// TestXML plugs NewReaderByName into an xml.Decoder as its CharsetReader and
+// checks that a windows-1252 document with 0xE9 bytes decodes to "résumé".
+func TestXML(t *testing.T) {
+	const doc = "<?xml version=\"1.0\" encoding=\"windows-1252\"?><a><Word>r\xe9sum\xe9</Word></a>"
+	const want = "résumé"
+
+	dec := xml.NewDecoder(strings.NewReader(doc))
+	dec.CharsetReader = NewReaderByName
+
+	var got struct {
+		Word string
+	}
+	if err := dec.Decode(&got); err != nil {
+		t.Fatalf("Decode: %v", err)
+	}
+	if got.Word != want {
+		t.Errorf("got %q, want %q", got.Word, want)
+	}
+}