🗓 Feb 1, 2024 12:11:12 AM

🐙 improve unicode decode/encode
securisec · Feb 1, 2024 · d459d84
1 parent 7dfcdf8
commit d459d84
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -96,10 +96,10 @@ pip3 install -e .
 # I use -e here so that if I update later with git pull, I dont have it install it again (unless dependencies have changed)
 ```
 
-#### [Docker](https://hub.docker.com/r/securisec/chepy)
+<!-- #### [Docker](https://hub.docker.com/r/securisec/chepy)
 ```bash
 docker run --rm -ti -v $PWD:/data securisec/chepy "some string" [somefile, "another string"]
-```
+``` -->
 
 #### Standalone binary
 One can build Chepy to be a standalone binary also. This includes packaging all the dependencies together.

diff --git a/chepy/__version__.py b/chepy/__version__.py
@@ -1,2 +1,2 @@
-__version__ = "6.5.0"  # pragma: no cover
+__version__ = "6.6.0"  # pragma: no cover
 __author__ = "@securisec"  # pragma: no cover
diff --git a/chepy/modules/language.py b/chepy/modules/language.py
@@ -89,23 +89,44 @@ def remove_diacritics(self) -> LanguageT:
         return self
 
     @ChepyDecorators.call_stack
-    def unicode_to_str(self) -> LanguageT:
+    def unicode_to_str(self, as_bytes=False) -> LanguageT:
         """Escape any \\u characters to its proper unicode representation
 
+        Args:
+            as_bytes (bool): Treat state as bytes. This does not handle %u or U+ encodings
+
         Returns:
             Chepy: The Chepy object.
         """
-        self.state = self._convert_to_bytes().decode(
-            "unicode-escape", errors="backslashreplace"
-        )
+        if as_bytes:
+            self.state = self._convert_to_bytes().decode(
+                "unicode-escape", errors="backslashreplace"
+            )
+        else:
+            data = self._convert_to_str()
+            cleaned_string = re.sub(r"(\\u|%u|U\+)", r"\\u", data)
+            self.state = bytes(cleaned_string, "utf-8").decode(
+                "unicode-escape", errors="backslashreplace"
+            )
         return self
 
     @ChepyDecorators.call_stack
-    def str_to_unicode(self) -> LanguageT:
+    def str_to_unicode(self, prefix: str = "\\u", all_chars: bool = False) -> LanguageT:
         """Convert unicode to str
 
+        Args:
+            prefix (str): Prefix character.
+            all_chars (bool): Force convert all chars to unicode.
+
         Returns:
             Chepy: The Chepy object.
         """
-        self.state = self._convert_to_str().encode("unicode_escape")
+        data = self._convert_to_str()
+        if all_chars:
+            hold = []
+            for d in data:
+                hold.append("{}{:04x}".format(prefix, ord(d)))
+            self.state = "".join(hold)
+            return self
+        self.state = data.encode("unicode_escape")
         return self
diff --git a/chepy/modules/language.pyi b/chepy/modules/language.pyi
@@ -51,5 +51,5 @@ class Language(ChepyCore):
     def encode_us_ascii_7_bit(self: LanguageT) -> LanguageT: ...
     def decode(self: LanguageT, encoding: ENCODINGS, errors: Literal['ignore', 'replace', 'backslashreplace']=...) -> LanguageT: ...
     def remove_diacritics(self: LanguageT) -> LanguageT: ...
-    def unicode_to_str(self: LanguageT) -> LanguageT: ...
-    def str_to_unicode(self: LanguageT) -> LanguageT: ...
+    def unicode_to_str(self: LanguageT, as_bytes: bool=False) -> LanguageT: ...
+    def str_to_unicode(self: LanguageT, prefix: Literal['\\u', '%u', 'U+']='\\u', all_chars: bool=False) -> LanguageT: ...
diff --git a/tests/test_language.py b/tests/test_language.py
@@ -63,7 +63,8 @@ def test_encode_cp932():
 
 def test_decode_cp932():
     assert (
-        Chepy("82b182f182c982bf82cd").hex_to_str().decode("cp932").o.decode() == "こんにちは"
+        Chepy("82b182f182c982bf82cd").hex_to_str().decode("cp932").o.decode()
+        == "こんにちは"
     )
 
 
@@ -177,14 +178,21 @@ def test_decode_cp1258():
         Chepy("745c75316561316d2062695c753165633774")
         .hex_to_str()
         .decode("cp1258")
-        .unicode_to_str()
+        .unicode_to_str(True)
         .o.decode()
         == "tạm biệt"
     )
 
 
-def test_str_to_unice():
+def test_unicode_to_str():
+    assert Chepy("U+0073U+0061U+006d").unicode_to_str().o == b"sam"
+
+
+def test_str_to_unicode():
     assert Chepy("籯").str_to_unicode().o == b"\\u7c6f"
+    assert (
+        Chepy(b"sam").str_to_unicode(all_chars=True).o.decode() == r"\u0073\u0061\u006d"
+    )
 
 
 def test_encode_iso8859_2():
@@ -333,6 +341,8 @@ def test_remove_diacritics():
 
 def test_us_ascii_7_bit():
     assert (
-        Chepy("걳걵걮걻걢갴걳갳걟갱갲갸걟갱갵걟걢갱건걟걲갳걭갴거거갱걮걧걽").encode_us_ascii_7_bit().o
+        Chepy("걳걵걮걻걢갴걳갳걟갱갲갸걟갱갵걟걢갱건걟걲갳걭갴거거갱걮걧걽")
+        .encode_us_ascii_7_bit()
+        .o
         == b"sun{b4s3_128_15_b1t_r3m4pp1ng}"
     )