-
Notifications
You must be signed in to change notification settings - Fork 0
/
u8xml.go
110 lines (102 loc) · 3.17 KB
/
u8xml.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// Copyright 2024 Serguei Vine. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
//
// Package u8xml implements NewDecoder, which can be used to parse
// XML files with IANA character encodings such as Windows-1252, ISO-8859-1, Unicode, etc.
// It can be used to decode XML files/strings with the Go standard library xml package
// Decoder type methods like Decode(), Token(), etc.
//
// XML files must contain a BOM at the beginning in the case of unicode characters or
// an XML declaration with an encoding attribute otherwise.
//
// XML files with UTF-8 content may be detected either by BOM or XML declaration.
// XML files with no BOM or XML declaration will be treated as UTF-8.
package u8xml
import (
"bufio"
"bytes"
"encoding/xml"
"io"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
)
// boms maps byte order marks to the encoding name they announce.
//
// Order matters: the UTF-32LE mark starts with the same two bytes as the
// UTF-16LE mark, so the four-byte marks must be tried before the two-byte ones.
var boms = []struct {
	bom []byte
	utf string
}{
	{[]byte{0xFF, 0xFE, 0x00, 0x00}, "UTF-32LE"},
	{[]byte{0x00, 0x00, 0xFE, 0xFF}, "UTF-32BE"},
	{[]byte{0xEF, 0xBB, 0xBF}, "UTF-8"},
	{[]byte{0xFF, 0xFE}, "UTF-16LE"},
	{[]byte{0xFE, 0xFF}, "UTF-16BE"},
}

// DetectEncoding detects the encoding of the beginning of an XML document.
//
// A byte order mark takes precedence. Without one, the encoding
// pseudo-attribute of a leading XML declaration ("<?xml ... ?>") is used.
// Only the declaration itself is inspected, so an "encoding" attribute on a
// later element is never mistaken for it. Both quote styles and optional
// whitespace around '=' are accepted.
//
// Parameters:
//   - buf: the leading bytes of the document; it is not modified.
//
// Returns:
//   - string: the detected encoding, or the default "UTF-8" if neither a BOM
//     nor a well-formed, non-empty encoding declaration is found.
//   - int: the length of the BOM if a BOM is found, or 0 otherwise.
func DetectEncoding(buf []byte) (string, int) {
	const (
		defaultEncoding = "UTF-8"
		xmlSpace        = " \t\r\n"
	)

	// A BOM, when present, decides the encoding on its own.
	for _, b := range boms {
		if bytes.HasPrefix(buf, b.bom) {
			return b.utf, len(b.bom)
		}
	}

	if !bytes.HasPrefix(buf, []byte("<?xml")) {
		return defaultEncoding, 0
	}

	// Restrict the search to the declaration. If its end is not inside buf
	// (truncated input), fall back to scanning everything that is available.
	decl := buf
	if end := bytes.Index(decl, []byte("?>")); end != -1 {
		decl = decl[:end]
	}

	key := []byte("encoding")
	pos := bytes.Index(decl, key)
	if pos == -1 {
		return defaultEncoding, 0
	}

	// Expect: S? '=' S? quote value quote
	rest := bytes.TrimLeft(decl[pos+len(key):], xmlSpace)
	if len(rest) == 0 || rest[0] != '=' {
		return defaultEncoding, 0
	}
	rest = bytes.TrimLeft(rest[1:], xmlSpace)
	if len(rest) == 0 || (rest[0] != '"' && rest[0] != '\'') {
		return defaultEncoding, 0
	}
	quote := rest[0]
	rest = rest[1:]

	// The value must be closed by the same quote and must not be empty.
	end := bytes.IndexByte(rest, quote)
	if end <= 0 {
		return defaultEncoding, 0
	}
	return string(rest[:end]), 0
}
// bufCapacity is the number of leading bytes NewReader peeks at to detect the
// encoding (BOM or XML declaration).
const bufCapacity = 128

// unsupportedEncodingError reports an encoding name for which no decoder is
// available. Its value is the offending name.
type unsupportedEncodingError string

// Error implements the error interface.
func (e unsupportedEncodingError) Error() string {
	return "u8xml: unsupported encoding \"" + string(e) + "\""
}

// NewReader returns an io.Reader that converts the bytes read from r to UTF-8.
//
// The encoding is detected with DetectEncoding from the first bufCapacity
// bytes of r; a leading BOM is skipped. Input detected as "UTF-8" is passed
// through without transformation.
//
// r - input io.Reader
// Returns io.Reader, error
//
// When the detected encoding cannot be resolved to a decoder, the returned
// error is non-nil and the returned reader yields the input untransformed
// (minus any BOM), so callers may still choose to read from it.
func NewReader(r io.Reader) (io.Reader, error) {
	t := bufio.NewReader(r)

	// Peek reports an error whenever fewer than bufCapacity bytes are
	// available, which is normal for short documents. Detection is
	// best-effort on whatever bytes were returned, so the error is ignored.
	buf, _ := t.Peek(bufCapacity)

	strEnc, bomLen := DetectEncoding(buf)
	if bomLen > 0 {
		// The BOM bytes were just returned by Peek and are therefore already
		// buffered.
		t.Discard(bomLen) // skip BOM
	}
	if strEnc == "UTF-8" {
		return t, nil
	}

	enc, err := ianaindex.IANA.Encoding(strEnc)
	if err != nil {
		return t, err // do not transform in the case of error
	}
	// Guard against a nil Encoding returned together with a nil error;
	// calling NewDecoder on it would panic.
	if enc == nil {
		return t, unsupportedEncodingError(strEnc)
	}
	return transform.NewReader(t, enc.NewDecoder()), nil
}
// NewDecoder returns an *xml.Decoder that parses the document read from r.
// The input is wrapped with NewReader first, so the decoder is fed UTF-8.
//
// r - input io.Reader
// Returns *xml.Decoder
func NewDecoder(r io.Reader) *xml.Decoder {
	// NewReader hands back a reader even when it reports an error, and this
	// function has no error result, so the error is deliberately dropped.
	src, _ := NewReader(r)

	// The stream has already been through NewReader, so the charset named in
	// the XML declaration is not converted a second time: the input is
	// returned unchanged.
	passthrough := func(_ string, in io.Reader) (io.Reader, error) {
		return in, nil
	}

	dec := xml.NewDecoder(src)
	dec.CharsetReader = passthrough
	return dec
}