diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 41df858..f086079 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,25 +1,30 @@ -name: Build, Release to PyPI +name: Release to PyPI on: push: - branches: - - main - pull_request: - branches: - - "**" +# branches: [ "main" ] + tags: [ "v**" ] +# pull_request: +# branches: +# - "**" jobs: - build: + release: + name: Release + if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: "3.10" - - name: Install - run: pip install -e . wheel twine - - name: Build + cache: "pip" + - name: Install release deps + run: pip install utz[setup] twine wheel + - name: Install dvc-utils + run: pip install -e . + - name: Build release run: python setup.py sdist bdist_wheel - - name: Publish to PyPI - run: twine upload dist/* + - name: Publish release env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: twine upload dist/* diff --git a/README.md b/README.md index c843df2..7f51cad 100644 --- a/README.md +++ b/README.md @@ -45,14 +45,21 @@ dvc-utils diff --help ``` ## Examples -Use [`parquet2json`] to observe schema changes to a Parquet file, in a given commit from [neighbor-ryan/nj-crashes](https://github.com/neighbor-ryan/nj-crashes): +See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit]. 
+ +```bash +git clone https://github.com/neighbor-ryan/nj-crashes +commit=c8ae28e +path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc +``` + +### Parquet schema diff +Use [`parquet2json`] to observe schema changes to a Parquet file, in [a given commit][commit] from [neighbor-ryan/nj-crashes]: ```bash parquet_schema() { parquet2json "$1" schema } export -f parquet_schema -commit=7fa6a07 -path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc dvc-utils diff -r $commit^..$commit parquet_schema $path ```
Output @@ -97,8 +104,12 @@ dvc-utils diff -r $commit^..$commit parquet_schema $path 51a48 > OPTIONAL INT64 Date (TIMESTAMP(MICROS,false)); ``` + +Here we can see that various date/time columns were consolidated, and several stringly-typed columns were converted to ints, floats, and booleans. +
+### Parquet row diff Diff the first row of the Parquet file above (pretty-printed as JSON), before and after the given commit: ```bash @@ -154,10 +165,25 @@ dvc-utils diff -r $commit^..$commit pretty_print_first_row $path > "Reporting Badge No.": "830", > "Date": "2001-12-21 18:34:00 +00:00" ``` + +This reflects the schema changes above. + +### Parquet row count diff +```bash +parquet_row_count() { + parquet2json "$1" rowcount +} +export -f parquet_row_count +dvc-utils diff -r $commit^..$commit parquet_row_count $path +``` +This time we get no output; [the given `$commit`][commit] didn't change the row count in the DVC-tracked Parquet file [`$path`][commit path]. [DVC]: https://dvc.org/ [`parquet2json`]: https://github.com/jupiter/parquet2json [neighbor-ryan/nj-crashes]: https://github.com/neighbor-ryan/nj-crashes +[Parquet]: https://parquet.apache.org/ +[commit]: https://github.com/neighbor-ryan/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7 +[commit path]: https://github.com/neighbor-ryan/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c diff --git a/setup.py b/setup.py index 2ebf42f..ec273f3 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,19 @@ setup( name='dvc-utils', - version="0.0.1", - install_requires=open("requirements.txt").readlines(), + version="0.0.2", + description="CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", packages=['dvc_utils'], entry_points={ 'console_scripts': [ 'dvc-utils = dvc_utils.main:cli', ], }, + license="MIT", + author="Ryan Williams", + author_email="ryan@runsascoded.com", + project_urls={"Author": "https://github.com/ryan-williams"}, + url="https://github.com/runsascoded/dvc-utils", )