Skip to content

Commit

Permalink
🗓 Feb 1, 2024 12:11:12 AM
Browse files Browse the repository at this point in the history
🐙 improve unicode decode/encode
  • Loading branch information
securisec committed Feb 1, 2024
1 parent 7dfcdf8 commit d459d84
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 15 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,10 @@ pip3 install -e .
# I use -e here so that if I update later with git pull, I don't have to install it again (unless dependencies have changed)
```

#### [Docker](https://hub.docker.com/r/securisec/chepy)
<!-- #### [Docker](https://hub.docker.com/r/securisec/chepy)
```bash
docker run --rm -ti -v $PWD:/data securisec/chepy "some string" [somefile, "another string"]
```
``` -->

#### Standalone binary
One can build Chepy to be a standalone binary also. This includes packaging all the dependencies together.
Expand Down
2 changes: 1 addition & 1 deletion chepy/__version__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = "6.5.0" # pragma: no cover
__version__ = "6.6.0" # pragma: no cover
__author__ = "@securisec" # pragma: no cover
33 changes: 27 additions & 6 deletions chepy/modules/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,23 +89,44 @@ def remove_diacritics(self) -> LanguageT:
return self

@ChepyDecorators.call_stack
def unicode_to_str(self, as_bytes=False) -> LanguageT:
    """Decode unicode escape sequences in the state into the characters they name.

    With the default ``as_bytes=False`` the state is read as a string and the
    ``\\uXXXX``, ``%uXXXX`` and ``U+XXXX`` notations are all accepted.

    Args:
        as_bytes (bool): Treat state as bytes. This does not handle %u or U+ encodings

    Returns:
        Chepy: The Chepy object.
    """
    if as_bytes:
        self.state = self._convert_to_bytes().decode(
            "unicode-escape", errors="backslashreplace"
        )
        return self

    data = self._convert_to_str()
    # Rewrite %uXXXX / U+XXXX to \uXXXX, but only when four hex digits follow,
    # so ordinary text such as "CPU+GPU" or "100%used" is left untouched.
    normalized = re.sub(r"(?:\\u|%u|U\+)(?=[0-9a-fA-F]{4})", r"\\u", data)
    # The unicode-escape decoder reads raw bytes as Latin-1. Encoding to UTF-8
    # first would turn every literal non-ASCII character into mojibake, so
    # encode as Latin-1 and let backslashreplace turn anything above U+00FF
    # into an escape that the decoder restores.
    self.state = normalized.encode("latin-1", errors="backslashreplace").decode(
        "unicode-escape", errors="backslashreplace"
    )
    return self

@ChepyDecorators.call_stack
def str_to_unicode(self, prefix: str = "\\u", all_chars: bool = False) -> LanguageT:
    """Encode the state as unicode escape sequences.

    Args:
        prefix (str): Text placed in front of each hex code point. Only used
            when all_chars is True.
        all_chars (bool): Force convert all chars to unicode. Each character
            becomes prefix followed by its code point in lowercase hex,
            zero-padded to at least four digits.

    Returns:
        Chepy: The Chepy object.
    """
    text = self._convert_to_str()
    if not all_chars:
        # Let Python's unicode_escape codec decide what gets escaped; the
        # state becomes bytes in this branch.
        self.state = text.encode("unicode_escape")
        return self
    self.state = "".join("{}{:04x}".format(prefix, ord(ch)) for ch in text)
    return self
4 changes: 2 additions & 2 deletions chepy/modules/language.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ class Language(ChepyCore):
def encode_us_ascii_7_bit(self: LanguageT) -> LanguageT: ...
def decode(self: LanguageT, encoding: ENCODINGS, errors: Literal['ignore', 'replace', 'backslashreplace']=...) -> LanguageT: ...
def remove_diacritics(self: LanguageT) -> LanguageT: ...
def unicode_to_str(self: LanguageT) -> LanguageT: ...
def str_to_unicode(self: LanguageT) -> LanguageT: ...
def unicode_to_str(self: LanguageT, as_bytes: bool=False) -> LanguageT: ...
def str_to_unicode(self: LanguageT, prefix: Literal['\\u', '%u', 'U+']='\\u', all_chars: bool=False) -> LanguageT: ...
18 changes: 14 additions & 4 deletions tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ def test_encode_cp932():

def test_decode_cp932():
    # Hex of the cp932-encoded bytes for the expected greeting.
    decoded = Chepy("82b182f182c982bf82cd").hex_to_str().decode("cp932").o.decode()
    assert decoded == "こんにちは"


Expand Down Expand Up @@ -177,14 +178,21 @@ def test_decode_cp1258():
Chepy("745c75316561316d2062695c753165633774")
.hex_to_str()
.decode("cp1258")
.unicode_to_str()
.unicode_to_str(True)
.o.decode()
== "tạm biệt"
)


def test_unicode_to_str():
    # U+XXXX notation goes through the default (as_bytes=False) path.
    decoded = Chepy("U+0073U+0061U+006d").unicode_to_str().o
    assert decoded == b"sam"


def test_str_to_unicode():
    # Default mode: the non-ASCII character is escaped.
    escaped = Chepy("籯").str_to_unicode().o
    assert escaped == b"\\u7c6f"

    # all_chars=True: ASCII characters are escaped as well.
    every_char = Chepy(b"sam").str_to_unicode(all_chars=True).o.decode()
    assert every_char == r"\u0073\u0061\u006d"


def test_encode_iso8859_2():
Expand Down Expand Up @@ -333,6 +341,8 @@ def test_remove_diacritics():

def test_us_ascii_7_bit():
    # Input string and the ASCII bytes the encoder is expected to produce.
    remapped = "걳걵걮걻걢갴걳갳걟갱갲갸걟갱갵걟걢갱건걟걲갳걭갴거거갱걮걧걽"
    encoded = Chepy(remapped).encode_us_ascii_7_bit().o
    assert encoded == b"sun{b4s3_128_15_b1t_r3m4pp1ng}"

0 comments on commit d459d84

Please sign in to comment.