Skip to content

Commit

Permalink
test: update remaining python-* to use --gunzip
Browse files Browse the repository at this point in the history
This also fixes an issue with installRequirements: make sure to clean out the venv if the install has not finished and the context was cancelled.

Signed-off-by: Nick Mitchell <nickm@us.ibm.com>
  • Loading branch information
starpit committed Nov 29, 2024
1 parent bc9d0eb commit 991a52c
Show file tree
Hide file tree
Showing 51 changed files with 55 additions and 41 deletions.
16 changes: 16 additions & 0 deletions pkg/runtime/needs/install_requirements.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,29 @@ pip%s install %s %s -r %s %s 1>&2`, version, venvPath, venvPath, version, versio
cmd.Stdout = os.Stderr // Stderr so as not to collide with `lunchpail needs` stdout
cmd.Stderr = os.Stderr

alreadyCleanedUp := false
installSuccessful := false
go func() {
select {
case <-ctx.Done():
if !installSuccessful && !alreadyCleanedUp {
if err := os.RemoveAll(venvPath); err != nil {
fmt.Fprintln(os.Stderr, "Unable to clean up venv cache directory after pip install failure", err)
}
alreadyCleanedUp = true
}
}
}()

if err := cmd.Run(); err != nil {
// Clean up the venv cache directory, since we failed at populating it
if err := os.RemoveAll(venvPath); err != nil {
fmt.Fprintln(os.Stderr, "Unable to clean up venv cache directory after pip install failure", err)
}
alreadyCleanedUp = true
return path, err
}
installSuccessful = true

return path, nil
}
Expand Down
1 change: 1 addition & 0 deletions tests/tests/python-code-header-cleanser/pail/command
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python3.10 main.py
1 change: 1 addition & 0 deletions tests/tests/python-code-header-cleanser/pail/image
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
docker.io/python:3.10
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ data-prep-toolkit==0.2.2.dev1
scancode-toolkit ; platform_system != 'Darwin'

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18
pyarrow<17

setuptools
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/tests/python-code-header-cleanser/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ function validate {
rm -f "$actual"
}

validate task.1.txt "$DATA"/expected/test1.parquet.gz
validate test1.parquet "$DATA"/expected/test1.parquet.gz
2 changes: 1 addition & 1 deletion tests/tests/python-code-header-cleanser/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz'
3 changes: 1 addition & 2 deletions tests/tests/python-language-doc-chunk/pail/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@ llama-index-core>=0.11.0,<0.12.0
# sigh. see https://github.com/run-llama/llama_index/issues/17016
pydantic==2.9.2

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18
pyarrow

Binary file not shown.
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/tests/python-language-doc-chunk/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ DATA="$TEST_PATH"/pail/test-data

for i in $(seq 1 1)
do
actual=task.${i}_0.parquet # pkg/boot/up.go currently downloads named pipes (see up_args in settings.sh) to cwd
expected="$DATA"/expected/test$i.parquet.gz
actual=test${i}_0.parquet
expected="$DATA"/expected/test${i}_0.parquet.gz

while true
do
Expand Down
2 changes: 1 addition & 1 deletion tests/tests/python-language-doc-chunk/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz'
2 changes: 1 addition & 1 deletion tests/tests/python-language-doc-quality/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ DATA="$TEST_PATH"/pail/test-data

for i in $(seq 1 1)
do
actual=task.$i.txt # pkg/boot/up.go currently downloads named pipes (see up_args in settings.sh) to cwd
actual=test$i.parquet
expected="$DATA"/expected/test$i.parquet.gz

while true
Expand Down
2 changes: 1 addition & 1 deletion tests/tests/python-language-doc-quality/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz'
3 changes: 1 addition & 2 deletions tests/tests/python-language-lang-id/pail/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,5 @@ langcodes==3.3.0
huggingface-hub >= 0.21.4, <1.0.0
numpy==1.26.4

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18
pyarrow

Binary file not shown.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/tests/python-language-lang-id/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ DATA="$TEST_PATH"/pail/test-data/sm

for i in $(seq 1 3)
do
actual=task.$i.txt # pkg/boot/up.go currently downloads named pipes (see up_args in settings.sh) to cwd
actual=test_0$i.parquet
expected="$DATA"/expected/test_0$i.parquet.gz

while true
Expand Down
2 changes: 1 addition & 1 deletion tests/tests/python-language-lang-id/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/sm/input/test_01.parquet.gz) <(gunzip -c "$TEST_PATH"/pail/test-data/sm/input/test_02.parquet.gz) <(gunzip -c "$TEST_PATH"/pail/test-data/sm/input/test_03.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/sm/input/test_01.parquet.gz "$TEST_PATH"/pail/test-data/sm/input/test_02.parquet.gz "$TEST_PATH"/pail/test-data/sm/input/test_03.parquet.gz'
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,5 @@ presidio-anonymizer>=2.2.355
flair>=0.14.0
pandas>=2.2.2

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18

pyarrow
spacy
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/tests/python-language-pii-redactor/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

DATA="$TEST_PATH"/pail/test-data

actual=task.1.txt # pkg/boot/up.go currently downloads named pipes (see up_args in settings.sh) to cwd
actual=1.parquet
expected="$DATA"/xs/1.expected.parquet.gz

while true
Expand Down
2 changes: 1 addition & 1 deletion tests/tests/python-language-pii-redactor/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/xs/1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/xs/1.parquet.gz'
1 change: 1 addition & 0 deletions tests/tests/python-universal-doc-id/pail/command
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python3.12 main.py
1 change: 1 addition & 0 deletions tests/tests/python-universal-doc-id/pail/image
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
docker.io/python:3.12
4 changes: 1 addition & 3 deletions tests/tests/python-universal-doc-id/pail/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
data-prep-toolkit==0.2.2.dev0

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18

pyarrow<17
setuptools
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/tests/python-universal-doc-id/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ function validate {
rm -f "$actual"
}

validate task.1.txt "$DATA"/expected/sample1.parquet.gz
validate sample1.parquet "$DATA"/expected/sample1.parquet.gz
2 changes: 1 addition & 1 deletion tests/tests/python-universal-doc-id/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/sample1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/input/sample1.parquet.gz'
1 change: 1 addition & 0 deletions tests/tests/python-universal-ededup/pail/command
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python3.12 main.py
1 change: 1 addition & 0 deletions tests/tests/python-universal-ededup/pail/image
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
docker.io/python:3.12
2 changes: 1 addition & 1 deletion tests/tests/python-universal-ededup/pail/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ mmh3==4.1.0
xxhash==3.4.1

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18
pyarrow<17

setuptools
2 changes: 1 addition & 1 deletion tests/tests/python-universal-ededup/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ function validate {
rm -f "$actual"
}

validate task.1.txt "$DATA"/expected/sample1.parquet.gz
validate sample1.parquet "$DATA"/expected/sample1.parquet.gz
2 changes: 1 addition & 1 deletion tests/tests/python-universal-ededup/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/sample1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/input/sample1.parquet.gz'
5 changes: 2 additions & 3 deletions tests/tests/python-universal-filter/pail/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
duckdb==0.10.1
duckdb==1.1.3

# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18
pyarrow

Binary file not shown.
2 changes: 1 addition & 1 deletion tests/tests/python-universal-filter/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ function validate {
rm -f "$actual"
}

validate task.1.txt "$DATA"/expected/test1.parquet.gz
validate test1.parquet "$DATA"/expected/test1.parquet.gz
2 changes: 1 addition & 1 deletion tests/tests/python-universal-filter/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz)'
up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz'
3 changes: 1 addition & 2 deletions tests/tests/python-universal-resize/pail/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change
pyarrow<18
pyarrow

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
12 changes: 6 additions & 6 deletions tests/tests/python-universal-resize/post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ function validate {
rm -f "$actual"
}

validate task.1_0.parquet "$DATA"/expected/task.1_0.parquet.gz
validate task.1_1.parquet "$DATA"/expected/task.1_1.parquet.gz
validate task.2_0.parquet "$DATA"/expected/task.2_0.parquet.gz
validate task.2_1.parquet "$DATA"/expected/task.2_1.parquet.gz
validate task.3_0.parquet "$DATA"/expected/task.3_0.parquet.gz
validate task.3_1.parquet "$DATA"/expected/task.3_1.parquet.gz
validate test1_0.parquet "$DATA"/expected/test1_0.parquet.gz
validate test1_1.parquet "$DATA"/expected/test1_1.parquet.gz
validate test2_0.parquet "$DATA"/expected/test2_0.parquet.gz
validate test2_1.parquet "$DATA"/expected/test2_1.parquet.gz
validate test3_0.parquet "$DATA"/expected/test3_0.parquet.gz
validate test3_1.parquet "$DATA"/expected/test3_1.parquet.gz
2 changes: 1 addition & 1 deletion tests/tests/python-universal-resize/settings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ NUM_DESIRED_OUTPUTS=0
# the default is --yaml. we don't want that
source_from=" "

up_args='-e max_rows_per_table=125 <(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz) <(gunzip -c "$TEST_PATH"/pail/test-data/input/test2.parquet.gz) <(gunzip -c "$TEST_PATH"/pail/test-data/input/test3.parquet.gz)'
up_args='-e max_rows_per_table=125 --gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz "$TEST_PATH"/pail/test-data/input/test2.parquet.gz "$TEST_PATH"/pail/test-data/input/test3.parquet.gz'

0 comments on commit 991a52c

Please sign in to comment.