diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml
index a5bc296..d2b6afc 100644
--- a/.github/workflows/test-python.yml
+++ b/.github/workflows/test-python.yml
@@ -11,21 +11,26 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  # lint-python:
-  #   name: Lint Python code
-  #   runs-on: ubuntu-latest
-  #   steps:
-  #     - uses: actions/checkout@v4
-
-  #     - name: Set up Python 3.8
-  #       uses: actions/setup-python@v2
-  #       with:
-  #         python-version: "3.8"
-
-  #     - name: run pre-commit
-  #       run: |
-  #         python -m pip install pre-commit
-  #         pre-commit run --all-files
+  pre-commit:
+    name: Run pre-commit on Python code
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Cache pre-commit virtualenvs
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pre-commit
+          key: pre-commit-3|${{ hashFiles('.pre-commit-config.yaml') }}
+
+      - name: run pre-commit
+        run: |
+          python -m pip install pre-commit
+          pre-commit run --all-files

   test-python:
     name: Build and test Python
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..afc91dc
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,25 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+
+# Default to Python 3
+default_language_version:
+  python: python3
+
+# Optionally both commit and push
+default_stages: [commit]
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.4.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-added-large-files
+        args: ["--maxkb=500"]
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.2
+    hooks:
+      - id: ruff
+        args: ["--fix"]
+      - id: ruff-format
diff --git a/DEVELOP.md b/DEVELOP.md
index d01772a..4af777d 100644
--- a/DEVELOP.md
+++ b/DEVELOP.md
@@ -10,3 +10,8 @@ poetry run maturin develop -m arro3-compute/Cargo.toml
 poetry run maturin develop -m arro3-io/Cargo.toml
 poetry run mkdocs serve
 ```
+
+### Adding a new module
+
+- Add new module to Github Actions matrix in `wheels.yml`
+- Update `docs.yml` to include module
diff --git a/arro3-io/python/arro3/io/_io.pyi b/arro3-io/python/arro3/io/_io.pyi
index 13d0f52..a3e52c4 100644
--- a/arro3-io/python/arro3/io/_io.pyi
+++ b/arro3-io/python/arro3/io/_io.pyi
@@ -28,8 +28,10 @@ def infer_csv_schema(
     Args:
         file: The input CSV path or buffer.
         has_header: Set whether the CSV file has a header. Defaults to None.
-        max_records: The maximum number of records to read to infer schema. Defaults to None.
-        delimiter: Set the CSV file's column delimiter as a byte character. Defaults to None.
+        max_records: The maximum number of records to read to infer schema. Defaults to
+            None.
+        delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
+            None.
         escape: Set the CSV escape character. Defaults to None.
         quote: Set the CSV quote character. Defaults to None.
         terminator: Set the line terminator. Defaults to None.
@@ -55,10 +57,14 @@ def read_csv(

     Args:
         file: The input CSV path or buffer.
-        schema: The Arrow schema for this CSV file. Use [infer_csv_schema][arro3.io.infer_csv_schema] to infer an Arrow schema if needed.
+        schema: The Arrow schema for this CSV file. Use
+            [infer_csv_schema][arro3.io.infer_csv_schema] to infer an Arrow schema if
+            needed.
         has_header: Set whether the CSV file has a header. Defaults to None.
-        batch_size: Set the batch size (number of records to load at one time). Defaults to None.
-        delimiter: Set the CSV file's column delimiter as a byte character. Defaults to None.
+        batch_size: Set the batch size (number of records to load at one time).
+            Defaults to None.
+        delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
+            None.
         escape: Set the CSV escape character. Defaults to None.
         quote: Set the CSV quote character. Defaults to None.
         terminator: Set the line terminator. Defaults to None.
@@ -89,7 +95,8 @@ def write_csv(
         data: The Arrow Table, RecordBatchReader, or RecordBatch to write.
         file: The output buffer or file path for where to write the CSV.
         header: Set whether to write the CSV file with a header. Defaults to None.
-        delimiter: Set the CSV file's column delimiter as a byte character. Defaults to None.
+        delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
+            None.
         escape: Set the CSV file's escape character as a byte character.

             In some variants of CSV, quotes are escaped using a special escape character
@@ -119,7 +126,8 @@ def infer_json_schema(

     Args:
         file: The input JSON path or buffer.
-        max_records: The maximum number of records to read to infer schema. If not provided, will read the entire file to deduce field types. Defaults to None.
+        max_records: The maximum number of records to read to infer schema. If not
+            provided, will read the entire file to deduce field types. Defaults to None.

     Returns:
         Inferred Arrow Schema
@@ -136,7 +144,8 @@ def read_json(
     Args:
         file: The JSON file or buffer to read from.
         schema: The Arrow schema representing the JSON data.
-        batch_size: Set the batch size (number of records to load at one time). Defaults to None.
+        batch_size: Set the batch size (number of records to load at one time). Defaults
+            to None.

     Returns:
         An arrow RecordBatchReader.
@@ -148,7 +157,7 @@ def write_json(
     *,
     explicit_nulls: bool | None = None,
 ) -> None:
-    """Write
+    """Write Arrow data to JSON.

     By default the writer will skip writing keys with null values for backward
     compatibility.
@@ -156,7 +165,8 @@ def write_json(
     Args:
         data: the Arrow Table, RecordBatchReader, or RecordBatch to write.
         file: the output file or buffer to write to
-        explicit_nulls: Set whether to keep keys with null values, or to omit writing them. Defaults to skipping nulls.
+        explicit_nulls: Set whether to keep keys with null values, or to omit writing
+            them. Defaults to skipping nulls.
     """

 def write_ndjson(
@@ -165,14 +175,16 @@ def write_ndjson(
     *,
     explicit_nulls: bool | None = None,
 ) -> None:
-    """
+    """Write Arrow data to newline-delimited JSON.

-    By default the writer will skip writing keys with null values for backward compatibility.
+    By default the writer will skip writing keys with null values for backward
+    compatibility.

     Args:
         data: the Arrow Table, RecordBatchReader, or RecordBatch to write.
         file: the output file or buffer to write to
-        explicit_nulls: Set whether to keep keys with null values, or to omit writing them. Defaults to skipping nulls.
+        explicit_nulls: Set whether to keep keys with null values, or to omit writing
+            them. Defaults to skipping nulls.
     """

 #### IPC
@@ -285,18 +297,26 @@ def write_parquet(
         file: The output file.

     Keyword Args:
-        bloom_filter_enabled: Sets if bloom filter is enabled by default for all columns (defaults to `false`).
-        bloom_filter_fpp: Sets the default target bloom filter false positive probability (fpp) for all columns (defaults to `0.05`).
-        bloom_filter_ndv: Sets default number of distinct values (ndv) for bloom filter for all columns (defaults to `1_000_000`).
-        column_compression: Sets compression codec for a specific column. Takes precedence over `compression`.
-        column_dictionary_enabled: Sets flag to enable/disable dictionary encoding for a specific column. Takes precedence over `dictionary_enabled`.
-        column_encoding: Sets encoding for a specific column. Takes precedence over `encoding`.
-        column_max_statistics_size: Sets max size for statistics for a specific column. Takes precedence over `max_statistics_size`.
+        bloom_filter_enabled: Sets if bloom filter is enabled by default for all columns
+            (defaults to `false`).
+        bloom_filter_fpp: Sets the default target bloom filter false positive
+            probability (fpp) for all columns (defaults to `0.05`).
+        bloom_filter_ndv: Sets default number of distinct values (ndv) for bloom filter
+            for all columns (defaults to `1_000_000`).
+        column_compression: Sets compression codec for a specific column. Takes
+            precedence over `compression`.
+        column_dictionary_enabled: Sets flag to enable/disable dictionary encoding for a
+            specific column. Takes precedence over `dictionary_enabled`.
+        column_encoding: Sets encoding for a specific column. Takes precedence over
+            `encoding`.
+        column_max_statistics_size: Sets max size for statistics for a specific column.
+            Takes precedence over `max_statistics_size`.
         compression: Sets default compression codec for all columns (default to
             `uncompressed`). Note that you can pass in a custom compression level with a
             string like `"zstd(3)"` or `"gzip(9)"` or `"brotli(3)"`.

-        created_by: Sets "created by" property (defaults to `parquet-rs version `).
+        created_by: Sets "created by" property (defaults to `parquet-rs version
+            `).

         data_page_row_count_limit: Sets best effort maximum number of rows in a data
             page (defaults to `20_000`).
@@ -318,7 +338,8 @@ def write_parquet(
             during reading. Note: this is a best effort limit based on value of
             `set_write_batch_size`.

-        dictionary_enabled: Sets default flag to enable/disable dictionary encoding for all columns (defaults to `True`).
+        dictionary_enabled: Sets default flag to enable/disable dictionary encoding for
+            all columns (defaults to `True`).

         dictionary_page_size_limit: Sets best effort maximum dictionary page size, in
             bytes (defaults to `1024 * 1024`).
@@ -337,9 +358,13 @@ def write_parquet(
             columns. In case when dictionary is enabled for any column, this value is
             considered to be a fallback encoding for that column.
         key_value_metadata: Sets "key_value_metadata" property (defaults to `None`).
-        max_row_group_size: Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
-        max_statistics_size: Sets default max statistics size for all columns (defaults to `4096`).
-        skip_arrow_metadata: Parquet files generated by this writer contain embedded arrow schema by default. Set `skip_arrow_metadata` to `True`, to skip encoding the embedded metadata (defaults to `False`).
+        max_row_group_size: Sets maximum number of rows in a row group (defaults to
+            `1024 * 1024`).
+        max_statistics_size: Sets default max statistics size for all columns (defaults
+            to `4096`).
+        skip_arrow_metadata: Parquet files generated by this writer contain embedded
+            arrow schema by default. Set `skip_arrow_metadata` to `True`, to skip
+            encoding the embedded metadata (defaults to `False`).

         write_batch_size: Sets write batch size (defaults to 1024).

@@ -349,6 +374,8 @@ def write_parquet(
             Additional limits such as such as `set_data_page_row_count_limit` are checked
             between batches, and thus the write batch size value acts as an upper-bound on
             the enforcement granularity of other limits.
-        writer_version: Sets the `WriterVersion` written into the parquet metadata (defaults to `"parquet_1_0"`). This value can determine what features some readers will support.
+        writer_version: Sets the `WriterVersion` written into the parquet metadata
+            (defaults to `"parquet_1_0"`). This value can determine what features some
+            readers will support.
     """

diff --git a/pyproject.toml b/pyproject.toml
index 2dd7206..30f3608 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,3 +20,21 @@ mkdocstrings = { version = "^0.25.1", extras = ["python"] }
 # https://github.com/squidfunk/mkdocs-material/issues/6983
 mkdocs-material = { version = "^9.5.17", extras = ["imaging"] }
 mike = "^2"
+
+
+[tool.ruff]
+select = [
+    # Pyflakes
+    "F",
+    # Pycodestyle
+    # "E",
+    "W",
+    # isort
+    "I",
+]
+
+[tool.ruff.extend-per-file-ignores]
+"__init__.py" = [
+    "F401", # Allow unused imports in __init__.py files
+    "F403", # unable to detect undefined names
+]
diff --git a/tests/core/test_constructors.py b/tests/core/test_constructors.py
index f8e1f7a..5f08d4a 100644
--- a/tests/core/test_constructors.py
+++ b/tests/core/test_constructors.py
@@ -3,12 +3,12 @@
 from arro3.core import (
     Array,
     DataType,
-    fixed_size_list_array,
     Field,
+    fixed_size_list_array,
     list_array,
+    list_offsets,
     struct_array,
 )
-from arro3.core import list_offsets


 def test_fixed_size_list_array():
diff --git a/tests/core/test_table.py b/tests/core/test_table.py
index 83e80f8..b8c126d 100644
--- a/tests/core/test_table.py
+++ b/tests/core/test_table.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
-from arro3.core import ChunkedArray, Table, Array, Field, DataType
+from arro3.core import Array, ChunkedArray, DataType, Field, Table


 def test_table_getitem():
diff --git a/tests/io/test_ipc.py b/tests/io/test_ipc.py
index 69db4c9..95e1449 100644
--- a/tests/io/test_ipc.py
+++ b/tests/io/test_ipc.py
@@ -1,7 +1,8 @@
-from arro3.io import read_ipc, write_ipc, read_ipc_stream, write_ipc_stream
 from io import BytesIO
 from pathlib import Path
+
 import pyarrow as pa
+from arro3.io import read_ipc, read_ipc_stream, write_ipc, write_ipc_stream


 def test_ipc_round_trip_string():
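For reference, a minimal sketch of the CSV round trip that the reflowed `_io.pyi` docstrings above describe. The keyword names (`header`, `has_header`, `max_records`, `batch_size`) come from those docstrings; the file name is illustrative, and passing a `pyarrow` table assumes the writers accept any Arrow PyCapsule-compatible input, as the pyarrow-based tests suggest.

```python
import pyarrow as pa

from arro3.io import infer_csv_schema, read_csv, write_csv

# Build a small Arrow table to write out (pyarrow used only for convenience).
table = pa.table({"city": ["Oslo", "Lima"], "population": [709_000, 10_092_000]})

# Write the table as CSV with a header row.
write_csv(table, "cities.csv", header=True)

# Infer a schema from the first rows, then read the file back as a stream.
schema = infer_csv_schema("cities.csv", has_header=True, max_records=100)
reader = read_csv("cities.csv", schema, has_header=True, batch_size=1024)

# pyarrow can typically consume the returned RecordBatchReader via the Arrow C stream interface.
print(pa.table(reader))
```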
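Similarly, a hedged sketch of the JSON writers documented above: by default keys with null values are skipped, while `explicit_nulls=True` keeps them. The newline-delimited file is used for the read-back; file names and the sample data are illustrative.

```python
import pyarrow as pa

from arro3.io import infer_json_schema, read_json, write_json, write_ndjson

table = pa.table({"name": ["a", None], "score": [1.5, None]})

# Default behaviour: keys whose values are null are omitted from the output.
write_json(table, "scores.json")

# Keep null-valued keys explicitly in the newline-delimited output.
write_ndjson(table, "scores.ndjson", explicit_nulls=True)

# Infer a schema (reading at most 1000 records), then read the data back.
schema = infer_json_schema("scores.ndjson", max_records=1000)
reader = read_json("scores.ndjson", schema, batch_size=1024)
print(pa.table(reader))
```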
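And a sketch of `write_parquet` using a few of the options whose descriptions are reflowed above. Only documented option names are used; the string form `"zstd(3)"` for a codec with a level is taken directly from the `compression` docstring, and the output path is illustrative.

```python
import pyarrow as pa

from arro3.io import write_parquet

table = pa.table({"id": list(range(10_000)), "value": [i * 0.5 for i in range(10_000)]})

write_parquet(
    table,
    "values.parquet",
    compression="zstd(3)",      # codec plus level, as the docstring allows
    max_row_group_size=4_096,   # down from the default of 1024 * 1024 rows
    dictionary_enabled=True,    # the default, shown here for clarity
    skip_arrow_metadata=False,  # keep the embedded Arrow schema (the default)
)
```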
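Finally, an IPC stream round trip in the spirit of `tests/io/test_ipc.py`, whose imports are re-ordered above. The `write_ipc_stream(data, file)` / `read_ipc_stream(file)` call shapes are assumed by analogy with the other writers and readers in `_io.pyi`; `BytesIO` stands in for a real file.

```python
from io import BytesIO

import pyarrow as pa

from arro3.io import read_ipc_stream, write_ipc_stream

table = pa.table({"a": [1, 2, 3]})

# Write the table to an in-memory Arrow IPC stream, then rewind the buffer.
buf = BytesIO()
write_ipc_stream(table, buf)
buf.seek(0)

# Read the stream back and materialize it with pyarrow for comparison.
round_tripped = pa.table(read_ipc_stream(buf))
assert round_tripped == table
```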