Skip to content

Commit

Permalink
add Guess*
Browse files Browse the repository at this point in the history
  • Loading branch information
yuin committed Jan 10, 2019
1 parent 891287c commit 8e6966c
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 5 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ go get github.com/yuin/charsetutil

- `Decode*` : Converts from the specified charset to UTF-8.
- `Encode*` : Converts from the UTF-8 to specified charset.
- `Guess*` : Guesses a character set.

- `MustDecode*` : Same as `Decode*`, but panics when errors occur
- `MustEncode*` : Same as `Encode*`, but panics when errors occur


```go
b, err = EncodeString("こんにちわ", "Windows-31J")
b, err = Encode("こんにちわ", "Windows-31J")
Expand All @@ -35,6 +35,11 @@ s = MustDecodeString(string(source), "Windows-31J")
s = MustDecode(source, "Windows-31J")
s = MustDecodeBytes(source, "Windows-31J")
s = MustDecodeReader(bytes.NewReader(source), "Windows-31J")

cs, err := GuessString(string(source))
cs, err := GuessBytes(source)
cs, err := GuessReader(bytes.NewReader(source))
cs, err := Guess(source)
```

## Supported character sets
Expand Down
93 changes: 89 additions & 4 deletions charsetutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ package charsetutil

import (
"bytes"
"errors"
"fmt"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"io"
"io/ioutil"
"strings"

"github.com/gogs/chardet"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

func panicIfError(err error) {
Expand All @@ -17,6 +19,74 @@ func panicIfError(err error) {
}
}

// CharsetGuess is a guessed character set, as returned by the Guess* functions.
type CharsetGuess interface {
// Charset returns the name of the guessed character set.
Charset() string

// Language returns the guessed language.
Language() string

// Confidence returns the confidence of this guess.
// NOTE(review): presumably on the detector's 0-100 scale — confirm against chardet.
Confidence() int
}

// charsetGuess adapts a chardet detection result to the CharsetGuess
// interface by exposing the result's fields through accessor methods.
type charsetGuess struct {
	*chardet.Result
}

// Charset returns the Charset field of the wrapped detection result.
func (cg *charsetGuess) Charset() string { return cg.Result.Charset }

// Language returns the Language field of the wrapped detection result.
func (cg *charsetGuess) Language() string { return cg.Result.Language }

// Confidence returns the Confidence field of the wrapped detection result.
func (cg *charsetGuess) Confidence() int { return cg.Result.Confidence }

// GuessBytes guesses the character set of the given bytes.
//
// It runs a fresh chardet text detector over s and returns its best match,
// or the detector's error when no guess could be made.
func GuessBytes(s []byte) (CharsetGuess, error) {
	best, err := chardet.NewTextDetector().DetectBest(s)
	if err != nil {
		return nil, err
	}
	return &charsetGuess{best}, nil
}

// Guess guesses the character set of the given bytes.
// It is a shorthand that forwards directly to GuessBytes.
func Guess(s []byte) (CharsetGuess, error) {
return GuessBytes(s)
}

// GuessReader guesses the character set of the data read from the given Reader.
//
// Only a leading sample of at most 128 bytes is consumed from s; detection is
// based on that sample alone. If s yields no data at all, io.EOF is returned.
// Any other read failure is returned unchanged, and a detector failure is
// returned as the detector reported it.
func GuessReader(s io.Reader) (CharsetGuess, error) {
	const sampleSize = 128

	buf := make([]byte, sampleSize)
	// io.ReadFull keeps reading until the sample is full or the stream ends,
	// so a Reader that delivers its data in small pieces is still sampled
	// properly. io.ErrUnexpectedEOF only means the input is shorter than the
	// sample size, which is not a failure here.
	n, err := io.ReadFull(s, buf)
	if err != nil && err != io.ErrUnexpectedEOF {
		return nil, err
	}

	// Pass only the bytes actually read: the unused tail of buf is zero
	// padding and must not take part in the detection.
	result, err := chardet.NewTextDetector().DetectBest(buf[:n])
	if err != nil {
		return nil, err
	}
	return &charsetGuess{result}, nil
}

// GuessString guesses the character set of the given string.
//
// The string's bytes are handed to a fresh chardet text detector; its best
// match is returned, or the detector's error when no guess could be made.
func GuessString(s string) (CharsetGuess, error) {
	raw := []byte(s)
	best, err := chardet.NewTextDetector().DetectBest(raw)
	if err != nil {
		return nil, err
	}
	return &charsetGuess{best}, nil
}

// DecodeReader converts given Reader to a UTF-8 string
func DecodeReader(s io.Reader, enc string) (string, error) {
reader, err := charset.NewReaderLabel(enc, s)
if err != nil {
Expand All @@ -29,46 +99,54 @@ func DecodeReader(s io.Reader, enc string) (string, error) {
return string(bytes), nil
}

// MustDecodeReader converts the contents of the given Reader, encoded in the
// charset named by enc, to a UTF-8 string. It behaves like DecodeReader, but
// hands any error to panicIfError instead of returning it.
func MustDecodeReader(s io.Reader, enc string) string {
ret, err := DecodeReader(s, enc)
panicIfError(err)
return ret
}

// DecodeBytes converts the given bytes, encoded in the charset named by enc,
// to a UTF-8 string. It wraps s in a bytes.Reader and delegates to DecodeReader.
func DecodeBytes(s []byte, enc string) (string, error) {
return DecodeReader(bytes.NewReader(s), enc)
}

// MustDecodeBytes converts the given bytes, encoded in the charset named by
// enc, to a UTF-8 string. It behaves like DecodeBytes, but hands any error to
// panicIfError instead of returning it.
func MustDecodeBytes(s []byte, enc string) string {
ret, err := DecodeReader(bytes.NewReader(s), enc)
panicIfError(err)
return ret
}

// DecodeString converts the given string, whose bytes are encoded in the
// charset named by enc, to a UTF-8 string. It wraps s in a strings.Reader and
// delegates to DecodeReader.
func DecodeString(s, enc string) (string, error) {
return DecodeReader(strings.NewReader(s), enc)
}

// MustDecodeString converts the given string, whose bytes are encoded in the
// charset named by enc, to a UTF-8 string. It behaves like DecodeString, but
// hands any error to panicIfError instead of returning it.
func MustDecodeString(s, enc string) string {
ret, err := DecodeReader(strings.NewReader(s), enc)
panicIfError(err)
return ret
}

// Decode converts the given bytes, encoded in the charset named by enc, to a
// UTF-8 string. It has the same signature and body as DecodeBytes.
func Decode(s []byte, enc string) (string, error) {
return DecodeReader(bytes.NewReader(s), enc)
}

// MustDecode converts the given bytes, encoded in the charset named by enc, to
// a UTF-8 string. It behaves like Decode, but hands any error to panicIfError
// instead of returning it.
func MustDecode(s []byte, enc string) string {
ret, err := DecodeReader(bytes.NewReader(s), enc)
panicIfError(err)
return ret
}

// EncodeReader converts a Reader to bytes encoded with given encoding
func EncodeReader(s io.Reader, enc string) ([]byte, error) {
e, _ := charset.Lookup(enc)
if e == nil {
return nil, errors.New(fmt.Sprintf("unsupported charset: %q", enc))
return nil, fmt.Errorf("unsupported charset: %q", enc)
}
var buf bytes.Buffer
writer := transform.NewWriter(&buf, e.NewEncoder())
Expand All @@ -79,36 +157,43 @@ func EncodeReader(s io.Reader, enc string) ([]byte, error) {
return buf.Bytes(), nil
}

// MustEncodeReader converts the contents of the given Reader to bytes encoded
// with the charset named by enc. It behaves like EncodeReader, but hands any
// error to panicIfError instead of returning it.
func MustEncodeReader(s io.Reader, enc string) []byte {
ret, err := EncodeReader(s, enc)
panicIfError(err)
return ret
}

// EncodeBytes converts the given bytes to bytes encoded with the charset named
// by enc. It wraps s in a bytes.Reader and delegates to EncodeReader.
func EncodeBytes(s []byte, enc string) ([]byte, error) {
return EncodeReader(bytes.NewReader(s), enc)
}

// MustEncodeBytes converts the given bytes to bytes encoded with the charset
// named by enc. It behaves like EncodeBytes, but hands any error to
// panicIfError instead of returning it.
func MustEncodeBytes(s []byte, enc string) []byte {
ret, err := EncodeReader(bytes.NewReader(s), enc)
panicIfError(err)
return ret
}

// EncodeString converts the given string to bytes encoded with the charset
// named by enc. It wraps s in a strings.Reader and delegates to EncodeReader.
func EncodeString(s, enc string) ([]byte, error) {
return EncodeReader(strings.NewReader(s), enc)
}

// MustEncodeString converts the given string to bytes encoded with the charset
// named by enc. It behaves like EncodeString, but hands any error to
// panicIfError instead of returning it.
func MustEncodeString(s, enc string) []byte {
ret, err := EncodeReader(strings.NewReader(s), enc)
panicIfError(err)
return ret
}

// Encode converts the given string to bytes encoded with the charset named by
// enc. It has the same body as EncodeString.
func Encode(s string, enc string) ([]byte, error) {
return EncodeReader(strings.NewReader(s), enc)
}

// MustEncode converts a string to bytes encoded with given encoding and panics if errors occur
func MustEncode(s string, enc string) []byte {
ret, err := EncodeReader(strings.NewReader(s), enc)
panicIfError(err)
Expand Down
30 changes: 30 additions & 0 deletions charsetutil_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,33 @@ func TestDecodeError(t *testing.T) {

assertPanic(func() string { return MustDecodeReader(bytes.NewReader(source), "unknown") })
}

// TestGuess checks that each Guess* entry point reports the expected character
// set and language for known inputs.
func TestGuess(t *testing.T) {
	sourceEuc := []byte{'\xa4', '\xa2', '\xa4', '\xa4', '\xa4', '\xa6', '\xa4', '\xa8', '\xa4', '\xaa', '\x0d', '\x0a', '\xa5', '\xbd', '\xc7', '\xbd', '\x0d', '\x0a', '\x74', '\x65', '\x73', '\x74', '\x0d', '\x0a', '\x8e', '\xb6', '\x8e', '\xb7', '\x8e', '\xb8', '\x8e', '\xb9', '\x8e', '\xba'}
	// sourceSjis := []byte{'\x82', '\xa0', '\x82', '\xa2', '\x82', '\xa4', '\x82', '\xa6', '\x82', '\xa8', '\x0d', '\x0a', '\x83', '\x5c', '\x94', '\x5c', '\x0d', '\x0a', '\x74', '\x65', '\x73', '\x74', '\x0d', '\x0a', '\xb6', '\xb7', '\xb8', '\xb9', '\xba'}

	// assert verifies a single guess against the expected charset and language.
	assert := func(r CharsetGuess, charset, language string, err error) {
		t.Helper()
		if err != nil {
			t.Errorf("Failed:%+v", err)
			// The Guess* functions return a nil CharsetGuess alongside an
			// error; calling its methods would panic and abort the whole
			// test run, so stop checking this case here.
			return
		}
		if r.Charset() != charset {
			t.Errorf("'%s' expected, but got '%s'", charset, r.Charset())
		}
		if r.Language() != language {
			t.Errorf("'%s' expected, but got '%s'", language, r.Language())
		}
	}

	result, err := Guess(sourceEuc)
	assert(result, "EUC-JP", "ja", err)

	result, err = GuessBytes(sourceEuc)
	assert(result, "EUC-JP", "ja", err)

	result, err = GuessReader(bytes.NewReader(sourceEuc))
	assert(result, "EUC-JP", "ja", err)

	result, err = GuessString("ああイイ”haa")
	assert(result, "UTF-8", "", err)

}
8 changes: 8 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
module github.com/yuin/charsetutil

require (
github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e
golang.org/x/text v0.3.0
)
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561 h1:aBzukfDxQlCTVS0NBUjI5YA3iVeaZ9Tb5PxNrrIP1xs=
github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e h1:bRhVy7zSSasaqNksaRZiA5EEI+Ei4I1nO5Jh72wfHlg=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=

0 comments on commit 8e6966c

Please sign in to comment.