From 3986ec7a412bb13ffd017bdd4ce6c2ef97c8b9db Mon Sep 17 00:00:00 2001 From: Ralph Slooten Date: Thu, 29 Aug 2024 17:14:49 +1200 Subject: [PATCH 1/2] feat: add option to disable character detection Resolves #340 --- options.go | 12 ++++++++++++ parser.go | 1 + part.go | 3 ++- part_test.go | 23 +++++++++++++++++++++++ testdata/parts/chardet-detection.raw | 8 ++++++++ 5 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 testdata/parts/chardet-detection.raw diff --git a/options.go b/options.go index b7e6fd2..8b3104a 100644 --- a/options.go +++ b/options.go @@ -100,3 +100,15 @@ func (o disableTextConversionOption) apply(p *Parser) { func DisableTextConversion(disableTextConversion bool) Option { return disableTextConversionOption(disableTextConversion) } + +type disableCharacterDetectionOption bool + +func (o disableCharacterDetectionOption) apply(p *Parser) { + p.disableCharacterDetection = bool(o) +} + +// DisableCharacterDetection controls character set auto-detection. When true, the parser skips detection and uses the +// character set declared by the message part; parts that declare none are handled as if this option were off. +func DisableCharacterDetection(disableCharacterDetection bool) Option { + return disableCharacterDetectionOption(disableCharacterDetection) +} diff --git a/parser.go b/parser.go index 2ca6939..9e57124 100644 --- a/parser.go +++ b/parser.go @@ -27,6 +27,7 @@ type Parser struct { customParseMediaType CustomParseMediaType stripMediaTypeInvalidCharacters bool disableTextConversion bool + disableCharacterDetection bool } // defaultParser is a Parser with default configuration. diff --git a/part.go b/part.go index 5aa9e37..579f015 100644 --- a/part.go +++ b/part.go @@ -210,7 +210,8 @@ func (p *Part) convertFromDetectedCharset(r io.Reader, readPartErrorPolicy ReadP // Restore r. 
r = bytes.NewReader(buf) - if cs == nil || cs.Confidence < minCharsetConfidence || len(bytes.Runes(buf)) < minCharsetRuneLength { + if (p.parser.disableCharacterDetection && p.Charset != "") || + (cs == nil || cs.Confidence < minCharsetConfidence || len(bytes.Runes(buf)) < minCharsetRuneLength) { // Low confidence or not enough characters, use declared character set. return p.convertFromStatedCharset(r), nil } diff --git a/part_test.go b/part_test.go index 8a7774e..a098e50 100644 --- a/part_test.go +++ b/part_test.go @@ -1318,3 +1318,26 @@ func TestCtypeInvalidCharacters(t *testing.T) { test.ComparePart(t, p, wantp) } + +func TestDisableCharacterDetectionPart(t *testing.T) { + var wantp *enmime.Part + r := test.OpenTestData("parts", "chardet-detection.raw") + parser := enmime.NewParser(enmime.DisableCharacterDetection(true)) + p, err := parser.ReadParts(r) + + // Examine root + if err != nil { + t.Fatalf("Unexpected parse error: %+v", err) + } + if p == nil { + t.Fatal("Root node should not be nil") + } + + wantp = &enmime.Part{ + ContentType: "text/plain", + PartID: "0", + Charset: "utf-8", + } + + test.ComparePart(t, p, wantp) +} diff --git a/testdata/parts/chardet-detection.raw b/testdata/parts/chardet-detection.raw new file mode 100644 index 0000000..351d9af --- /dev/null +++ b/testdata/parts/chardet-detection.raw @@ -0,0 +1,8 @@ +Content-Type: text/plain; charset=utf-8 +Content-Transfer-Encoding: quoted-printable + +Loggen Sie sich ein, um die Einladung zu akzeptieren oder geben Sie den fol= +gen1233 + +Nachricht: +=C3=B6o=C3=B6o From fc26e5e2e31836ab3e0de10cea288a8d59abda85 Mon Sep 17 00:00:00 2001 From: James Hillyerd Date: Sat, 31 Aug 2024 12:23:13 -0700 Subject: [PATCH 2/2] add test comment Signed-off-by: James Hillyerd --- part_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/part_test.go b/part_test.go index a098e50..4be0d59 100644 --- a/part_test.go +++ b/part_test.go @@ -1321,6 +1321,8 @@ func TestCtypeInvalidCharacters(t *testing.T) { func 
TestDisableCharacterDetectionPart(t *testing.T) { var wantp *enmime.Part + + // chardet considers this test file to be ISO-8859-1. r := test.OpenTestData("parts", "chardet-detection.raw") parser := enmime.NewParser(enmime.DisableCharacterDetection(true)) p, err := parser.ReadParts(r)