🐛 output(...) replace declarative mark using non-IANA compliant encoding name close #572
jawah · Dec 24, 2024 · 14b4649 · 14b4649
1 parent 1b06bc0
commit 14b4649
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,7 +16,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
 - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
 - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
-- unused `util.unicode_range` function.
+- unused `utils.unicode_range` function.
+
+### Fixed
+- converting content to Unicode bytes may insert non-IANA compliant encoding name (e.g. `utf_8` instead of `utf-8`). (#572)
 
 ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
 

diff --git a/src/charset_normalizer/models.py b/src/charset_normalizer/models.py
@@ -222,7 +222,7 @@ def output(self, encoding: str = "utf_8") -> bytes:
                     RE_POSSIBLE_ENCODING_INDICATION,
                     lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                         m.groups()[0],
-                        iana_name(self._output_encoding),  # type: ignore[arg-type]
+                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                     ),
                     decoded_string[:8192],
                     count=1,

diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py
@@ -34,7 +34,7 @@ def test_detect_most_common_body_encoding(payload, expected_encoding):
     [
         (
             b'<?xml version="1.0" encoding="EUC-JP"?>',
-            b'<?xml version="1.0" encoding="utf_8"?>',
+            b'<?xml version="1.0" encoding="utf-8"?>',
         ),
         (
             b'<html><head><meta charset="utf-8"></head></html>',
@@ -51,19 +51,19 @@ def test_detect_most_common_body_encoding(payload, expected_encoding):
         ),
         (
             b'<?xml version="1.0" encoding="US-ASCII"?>',
-            b'<?xml version="1.0" encoding="utf_8"?>',
+            b'<?xml version="1.0" encoding="utf-8"?>',
         ),
         (
             b'<?xml version="1.0" encoding="JohaB"?>',
-            b'<?xml version="1.0" encoding="utf_8"?>',
+            b'<?xml version="1.0" encoding="utf-8"?>',
         ),
         (
             b"<html><head><meta charset=WINDOWS-1252></head></html>",
-            b"<html><head><meta charset=utf_8></head></html>",
+            b"<html><head><meta charset=utf-8></head></html>",
         ),
         (
             b'<html><head><meta charset="WINDOWS-1256"></head></html>',
-            b'<html><head><meta charset="utf_8"></head></html>',
+            b'<html><head><meta charset="utf-8"></head></html>',
         ),
     ],
 )