Skip to content

Commit

Permalink
Fix issue 289, add function is_binary, add explicit support py 3.12 (#…
Browse files Browse the repository at this point in the history
…306)

Release 3.2.0
  • Loading branch information
Ousret authored Jul 7, 2023
1 parent 1b0fb5c commit 782885e
Show file tree
Hide file tree
Showing 13 changed files with 155 additions and 14 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ permissions:

jobs:
pre_flight_check:
name: Preflight Checks
uses: ./.github/workflows/ci.yml

universal-wheel:
Expand Down Expand Up @@ -127,7 +128,7 @@ jobs:
id-token: write
contents: write
with:
subject-base64: ${{ needs.checksum.outputs.hashes }}
base64-subjects: ${{ needs.checksum.outputs.hashes }}
upload-assets: true

deploy:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] # , "3.12-dev"
os: [ ubuntu-latest, macos-latest, windows-latest ]
env:
PYTHONIOENCODING: utf8 # only needed for Windows (console IO output encoding)
Expand Down
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,19 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.1.1.dev0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-05-??)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-07-07)

### Changed
- Typehint for function `from_path` no longer enforces `PathLike` as its first argument
- Minor improvement over the global detection reliability

### Added
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default `True`)
- Explicit support for Python 3.12

### Fixed
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)

## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)

Expand Down
3 changes: 2 additions & 1 deletion charset_normalizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"""
import logging

from .api import from_bytes, from_fp, from_path
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
Expand All @@ -31,6 +31,7 @@
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
Expand Down
76 changes: 74 additions & 2 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@


def from_bytes(
sequences: bytes,
sequences: Union[bytes, bytearray],
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
Expand All @@ -40,6 +40,7 @@ def from_bytes(
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
Expand Down Expand Up @@ -361,7 +362,8 @@ def from_bytes(
)
# Preparing those fallbacks in case we got nothing.
if (
encoding_iana in ["ascii", "utf_8", specified_encoding]
enable_fallback
and encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
Expand Down Expand Up @@ -507,6 +509,7 @@ def from_fp(
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but using a file pointer that is already ready.
Expand All @@ -522,6 +525,7 @@ def from_fp(
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)


Expand All @@ -535,6 +539,7 @@ def from_path(
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
Expand All @@ -551,4 +556,71 @@ def from_path(
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)


def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Detect if the given input (file path, raw bytes payload, or binary file pointer)
    holds binary — i.e. non-textual — content.

    Relies on the same heuristics and default kwargs as the main detection entry
    points, with the sole difference that fallback matches are disabled by default
    (``enable_fallback=False``) so ASCII-compatible content that is unlikely to be
    text is classified more strictly.

    :param fp_or_path_or_payload: A file path, a bytes/bytearray payload, or an
        already-opened binary file pointer.
    :return: True when no plausible text encoding could be found, False otherwise.
    """
    # All three detection entry points accept exactly the same keyword arguments;
    # build them once instead of repeating the nine-argument call three times.
    detection_kwargs = dict(
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )

    # Dispatch on the input's type: path-like -> from_path, raw payload -> from_bytes,
    # anything else is assumed to be a readable binary file pointer.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        guesses = from_path(fp_or_path_or_payload, **detection_kwargs)
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        guesses = from_bytes(fp_or_path_or_payload, **detection_kwargs)
    else:
        guesses = from_fp(fp_or_path_or_payload, **detection_kwargs)

    # An empty CharsetMatches means no charset could decode the content -> binary.
    return not guesses
17 changes: 14 additions & 3 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,14 +294,25 @@ def feed(self, character: str) -> None:
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
# Word/Buffer ending with a upper case accentuated letter are so rare,
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._foreign_long_count += 1
self._is_current_word_bad = True
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False

if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True

if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True

if self._is_current_word_bad:
self._bad_word_count += 1
Expand Down
4 changes: 2 additions & 2 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,12 @@ def is_emoticon(character: str) -> bool:

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Return True when the character acts as a word separator: whitespace,
    a selected set of ASCII delimiters, Unicode separators (category Z*),
    or common punctuation categories.
    """
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    # Z* covers space/line/paragraph separators; Po/Pd/Pc (other, dash and
    # connector punctuation) covers ',', ';', '-', '_', etc. so they no longer
    # need to be hard-coded individually.
    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

# Single source of truth for the package version (release 3.2.0).
__version__ = "3.2.0"
# Split form, e.g. ["3", "2", "0"], kept for programmatic comparison.
VERSION = __version__.split(".")
3 changes: 1 addition & 2 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Those functions are publicly exposed and are protected through our BC guarantee.
.. autofunction:: from_bytes
.. autofunction:: from_fp
.. autofunction:: from_path
.. autofunction:: is_binary

.. autoclass:: charset_normalizer.models.CharsetMatches
:inherited-members:
Expand Down Expand Up @@ -100,5 +101,3 @@ Some reusable functions used across the project. We do not guarantee the BC in t


.. class:: os.PathLike

Used as a generic way to accept AnyStr for paths.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Features
- Transpose any encoded content to Unicode the best we can.
- Detect spoken language in text.
- Ship with a great CLI.
- Also, detect binaries.

Start Guide
-----------
Expand Down
18 changes: 18 additions & 0 deletions docs/user/miscellaneous.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,21 @@ On `DEBUG` only one entry will be observed and that is about the detection resul

Then regarding the others log entries, they will be pushed as `Level 5`. Commonly known as TRACE level, but we do
not register it globally.


Detect binaries
---------------

This package offers a neat way to detect files that can be considered 'binary',
meaning that they are unlikely to be text files.

::

from charset_normalizer import is_binary

# It accepts a path, raw bytes, or even a file pointer.
result = is_binary("./my-file.ext")

# This should print 'True' or 'False'
print(result)

1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ classifiers =
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Programming Language :: Python :: Implementation :: PyPy
Topic :: Text Processing :: Linguistic
Topic :: Utilities
Expand Down
28 changes: 28 additions & 0 deletions tests/test_isbinary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest
import typing
from io import BytesIO
from base64 import b64decode
from charset_normalizer import is_binary
from os import path, pardir

# Repository root: one level above the tests/ directory. Used to resolve the
# shared sample files that live under ./data.
DIR_PATH = path.join(
    path.dirname(path.realpath(__file__)),
    pardir
)


@pytest.mark.parametrize(
    "raw, expected",
    [
        # Repeated NUL/0xFF-laden payload: clearly binary.
        (b'\x00\x5f\x2f\xff'*50, True),
        # Minimal GIF header as a raw bytes payload: binary.
        (b64decode("R0lGODlhAQABAAAAACw="), True),
        # Same GIF payload exposed through a file-like object (BytesIO): binary.
        (BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
        # Plain-text sample files (resolved to paths below): not binary.
        ('sample-polish.txt', False),
        ('sample-arabic.txt', False)
    ]
)
def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None:
    """is_binary must accept raw bytes, file pointers and file paths alike."""
    if isinstance(raw, str):
        # String parameters name sample files relative to the data directory.
        raw = DIR_PATH + "/data/{}".format(raw)

    assert is_binary(raw) is expected

0 comments on commit 782885e

Please sign in to comment.