diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8f4a9bab6..030f28405 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,9 +33,7 @@ jobs: - ./tests/bin/ci.sh -i 'test7(b.*|c.*|d.*|e.*|g.*|h.*)' - ./tests/bin/ci.sh -i 'test7f.*' - ./tests/bin/ci.sh -i 'test8.*' - - ./tests/bin/ci.sh -i 'python-code.*' - - ./tests/bin/ci.sh -i 'python-language.*' - - ./tests/bin/ci.sh -i 'python-universal.*' + - /tmp/lunchpail bat demos/data-prep-kit --concurrency 1 --auto-clean --target=$LUNCHPAIL_TARGET # bat=Build and Test - ./tests/bin/go.sh - ./tests/bin/pipelines.sh os: [ubuntu-latest] diff --git a/cmd/options/build.go b/cmd/options/build.go index 6c240663c..e7dde3da5 100644 --- a/cmd/options/build.go +++ b/cmd/options/build.go @@ -39,6 +39,7 @@ func AddBuildOptions(cmd *cobra.Command) (*build.Options, error) { cmd.Flags().IntVar(&options.Pack, "pack", options.Pack, "Run k concurrent tasks; if k=0 and machine has N cores, then k=N") cmd.Flags().BoolVarP(&options.Gunzip, "gunzip", "z", options.Gunzip, "Gunzip inputs before passing them to the worker logic") + cmd.Flags().BoolVar(&options.AutoClean, "auto-clean", options.AutoClean, "Clean up any caches prior to exiting") AddTargetOptionsTo(cmd, &options) AddLogOptionsTo(cmd, &options) diff --git a/cmd/subcommands/bat.go b/cmd/subcommands/bat.go new file mode 100644 index 000000000..e365471b6 --- /dev/null +++ b/cmd/subcommands/bat.go @@ -0,0 +1,43 @@ +//go:build full || manage + +package subcommands + +import ( + "context" + + "github.com/spf13/cobra" + + "lunchpail.io/cmd/options" + "lunchpail.io/pkg/be" + "lunchpail.io/pkg/boot" +) + +func init() { + var cmd = &cobra.Command{ + Use: "bat", + Short: "Build and test", + Long: "Build and test", + Args: cobra.MatchAll(cobra.MinimumNArgs(1), cobra.OnlyValidArgs), + } + + buildOpts, err := options.AddBuildOptions(cmd) + if err != nil { + panic(err) + } + + concurrency := 4 + cmd.Flags().IntVarP(&concurrency, "concurrency", "j", concurrency, "Maximum tests to run concurrently") + + cmd.RunE = func(cmd *cobra.Command, args []string) error { + ctx := context.Background() + + backend, err := be.NewInitOk(ctx, true, *buildOpts) + if err != nil { + return err + } + + return boot.BuildAndTester{Backend: backend, Concurrency: concurrency, Options: *buildOpts}.RunAll(ctx, args) + } + + rootCmd.AddCommand(cmd) +} diff --git a/cmd/subcommands/tester.go b/cmd/subcommands/tester.go index c062e0989..188617b80 100644 --- a/cmd/subcommands/tester.go +++ b/cmd/subcommands/tester.go @@ -9,7 +9,6 @@ import ( "lunchpail.io/cmd/options" "lunchpail.io/pkg/be" - "lunchpail.io/pkg/be/target" "lunchpail.io/pkg/boot" "lunchpail.io/pkg/build" ) @@ -27,16 +26,19 @@ func init() { panic(err) } + quiet := false + cmd.Flags().BoolVarP(&quiet, "quiet", "q", quiet, "Do not show stdout of application being tested") + cmd.RunE = func(cmd *cobra.Command, args []string) error { ctx := context.Background() - buildOpts.Target.Platform = target.Local - backend, err := be.New(ctx, *buildOpts) + buildOpts.CreateNamespace = true + backend, err := be.NewInitOk(ctx, true, *buildOpts) if err != nil { return err } - return boot.Tester{Backend: backend, Options: *buildOpts}.RunAll(ctx) + return boot.Tester{Quiet: quiet, Backend: backend, Options: *buildOpts}.RunAll(ctx) } rootCmd.AddCommand(cmd) diff --git a/cmd/subcommands/up.go b/cmd/subcommands/up.go index 5d0605000..8302f2816 100644 --- a/cmd/subcommands/up.go +++ b/cmd/subcommands/up.go @@ -68,7 +68,8 @@ func newUpCmd() *cobra.Command { return err } - return boot.Up(ctx, backend, boot.UpOptions{BuildOptions: *buildOpts, DryRun: dryrunFlag, Watch: watchFlag, Inputs: args, Executable: os.Args[0], NoRedirect: noRedirect}) + _, err = boot.Up(ctx, backend, boot.UpOptions{BuildOptions: *buildOpts, DryRun: dryrunFlag, Watch: watchFlag, WatchUtil: watchFlag, Inputs: args, Executable: os.Args[0], NoRedirect: noRedirect}) + return err } return cmd diff --git a/demos/data-prep-kit/README.md b/demos/data-prep-kit/README.md new file mode 100644 index 000000000..aaa75a8bf --- /dev/null +++ b/demos/data-prep-kit/README.md @@ -0,0 +1,3 @@ +# Lunchpail data-prep-kit Demos + +These are ports of most of the transforms from the [data-prep-kit](https://github.com/IBM/data-prep-kit). diff --git a/tests/tests/python-code-code-quality/pail/.helmignore b/demos/data-prep-kit/code/code-quality/pail/.helmignore similarity index 100% rename from tests/tests/python-code-code-quality/pail/.helmignore rename to demos/data-prep-kit/code/code-quality/pail/.helmignore diff --git a/tests/tests/python-code-code-quality/pail/requirements.txt b/demos/data-prep-kit/code/code-quality/requirements.txt similarity index 100% rename from tests/tests/python-code-code-quality/pail/requirements.txt rename to demos/data-prep-kit/code/code-quality/requirements.txt diff --git a/tests/tests/python-code-code-quality/pail/src/main.py b/demos/data-prep-kit/code/code-quality/src/main.py similarity index 100% rename from tests/tests/python-code-code-quality/pail/src/main.py rename to demos/data-prep-kit/code/code-quality/src/main.py diff --git a/tests/tests/python-code-code-quality/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/code/code-quality/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-code-code-quality/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/code/code-quality/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-code-code-quality/pail/test-data/expected/sample_1.parquet.gz b/demos/data-prep-kit/code/code-quality/test-data/expected/sample_1.parquet.gz similarity index 100% rename from tests/tests/python-code-code-quality/pail/test-data/expected/sample_1.parquet.gz rename to demos/data-prep-kit/code/code-quality/test-data/expected/sample_1.parquet.gz diff --git a/tests/tests/python-code-code-quality/pail/test-data/expected/sample_2.parquet.gz b/demos/data-prep-kit/code/code-quality/test-data/expected/sample_2.parquet.gz similarity index 100% rename from tests/tests/python-code-code-quality/pail/test-data/expected/sample_2.parquet.gz rename to demos/data-prep-kit/code/code-quality/test-data/expected/sample_2.parquet.gz diff --git a/tests/tests/python-code-code-quality/pail/test-data/input/sample_1.parquet.gz b/demos/data-prep-kit/code/code-quality/test-data/input/sample_1.parquet.gz similarity index 100% rename from tests/tests/python-code-code-quality/pail/test-data/input/sample_1.parquet.gz rename to demos/data-prep-kit/code/code-quality/test-data/input/sample_1.parquet.gz diff --git a/tests/tests/python-code-code-quality/pail/test-data/input/sample_2.parquet.gz b/demos/data-prep-kit/code/code-quality/test-data/input/sample_2.parquet.gz similarity index 100% rename from tests/tests/python-code-code-quality/pail/test-data/input/sample_2.parquet.gz rename to demos/data-prep-kit/code/code-quality/test-data/input/sample_2.parquet.gz diff --git a/tests/tests/python-code-code2parquet/pail/blobs/base64/languages/lang_extensions.json b/demos/data-prep-kit/code/code2parquet/blobs/base64/languages/lang_extensions.json similarity index 100% rename from tests/tests/python-code-code2parquet/pail/blobs/base64/languages/lang_extensions.json rename to demos/data-prep-kit/code/code2parquet/blobs/base64/languages/lang_extensions.json diff --git a/tests/tests/python-code-code2parquet/pail/env.yaml b/demos/data-prep-kit/code/code2parquet/env.yaml similarity index 100% rename from tests/tests/python-code-code2parquet/pail/env.yaml rename to demos/data-prep-kit/code/code2parquet/env.yaml diff --git a/tests/tests/python-code-code2parquet/pail/requirements.txt b/demos/data-prep-kit/code/code2parquet/requirements.txt similarity index 100% rename from tests/tests/python-code-code2parquet/pail/requirements.txt rename to demos/data-prep-kit/code/code2parquet/requirements.txt diff --git a/tests/tests/python-code-code2parquet/pail/src/main.py b/demos/data-prep-kit/code/code2parquet/src/main.py similarity index 100% rename from tests/tests/python-code-code2parquet/pail/src/main.py rename to demos/data-prep-kit/code/code2parquet/src/main.py diff --git a/tests/tests/python-code-code2parquet/pail/test-data/expected/application-java.parquet.gz b/demos/data-prep-kit/code/code2parquet/test-data/expected/application-java.zip.gz similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/expected/application-java.parquet.gz rename to demos/data-prep-kit/code/code2parquet/test-data/expected/application-java.zip.gz diff --git a/tests/tests/python-code-code2parquet/pail/test-data/expected/data-processing-lib.parquet.gz b/demos/data-prep-kit/code/code2parquet/test-data/expected/data-processing-lib.zip.gz similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/expected/data-processing-lib.parquet.gz rename to demos/data-prep-kit/code/code2parquet/test-data/expected/data-processing-lib.zip.gz diff --git a/tests/tests/python-code-code2parquet/pail/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet.gz b/demos/data-prep-kit/code/code2parquet/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.zip.gz similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet.gz rename to demos/data-prep-kit/code/code2parquet/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.zip.gz diff --git a/tests/tests/python-code-code2parquet/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/code/code2parquet/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/code/code2parquet/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-code-code2parquet/pail/test-data/input/application-java.zip b/demos/data-prep-kit/code/code2parquet/test-data/input/application-java.zip similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/input/application-java.zip rename to demos/data-prep-kit/code/code2parquet/test-data/input/application-java.zip diff --git a/tests/tests/python-code-code2parquet/pail/test-data/input/data-processing-lib.zip b/demos/data-prep-kit/code/code2parquet/test-data/input/data-processing-lib.zip similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/input/data-processing-lib.zip rename to demos/data-prep-kit/code/code2parquet/test-data/input/data-processing-lib.zip diff --git a/tests/tests/python-code-code2parquet/pail/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip b/demos/data-prep-kit/code/code2parquet/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip similarity index 100% rename from tests/tests/python-code-code2parquet/pail/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip rename to demos/data-prep-kit/code/code2parquet/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip diff --git a/tests/tests/python-code-header-cleanser/pail/command b/demos/data-prep-kit/code/header-cleanser/command similarity index 100% rename from tests/tests/python-code-header-cleanser/pail/command rename to demos/data-prep-kit/code/header-cleanser/command diff --git a/tests/tests/python-code-header-cleanser/pail/image b/demos/data-prep-kit/code/header-cleanser/image similarity index 100% rename from tests/tests/python-code-header-cleanser/pail/image rename to demos/data-prep-kit/code/header-cleanser/image diff --git a/tests/tests/python-code-header-cleanser/pail/requirements.txt b/demos/data-prep-kit/code/header-cleanser/requirements.txt similarity index 80% rename from tests/tests/python-code-header-cleanser/pail/requirements.txt rename to demos/data-prep-kit/code/header-cleanser/requirements.txt index b9b22a669..1135c6353 100644 --- a/tests/tests/python-code-header-cleanser/pail/requirements.txt +++ b/demos/data-prep-kit/code/header-cleanser/requirements.txt @@ -1,5 +1,5 @@ data-prep-toolkit==0.2.2.dev1 -scancode-toolkit ; platform_system != 'Darwin' +scancode-toolkit-mini # we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change pyarrow<17 diff --git a/tests/tests/python-code-header-cleanser/pail/src/header_cleanser_transform.py b/demos/data-prep-kit/code/header-cleanser/src/header_cleanser_transform.py similarity index 100% rename from tests/tests/python-code-header-cleanser/pail/src/header_cleanser_transform.py rename to demos/data-prep-kit/code/header-cleanser/src/header_cleanser_transform.py diff --git a/tests/tests/python-code-header-cleanser/pail/src/main.py b/demos/data-prep-kit/code/header-cleanser/src/main.py similarity index 100% rename from tests/tests/python-code-header-cleanser/pail/src/main.py rename to demos/data-prep-kit/code/header-cleanser/src/main.py diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/expected/test1.parquet.gz b/demos/data-prep-kit/code/header-cleanser/test-data/NOVALIDATE-test1.parquet.gz similarity index 100% rename from tests/tests/python-code-header-cleanser/pail/test-data/expected/test1.parquet.gz rename to demos/data-prep-kit/code/header-cleanser/test-data/NOVALIDATE-test1.parquet.gz diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.gz b/demos/data-prep-kit/code/header-cleanser/test-data/input/test1.parquet.gz similarity index 100% rename from tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.gz rename to demos/data-prep-kit/code/header-cleanser/test-data/input/test1.parquet.gz diff --git a/tests/tests/python-language-doc-chunk/README.md b/demos/data-prep-kit/language/doc-chunk/README.md similarity index 100% rename from tests/tests/python-language-doc-chunk/README.md rename to demos/data-prep-kit/language/doc-chunk/README.md diff --git a/tests/tests/python-language-doc-chunk/pail/requirements.txt b/demos/data-prep-kit/language/doc-chunk/requirements.txt similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/requirements.txt rename to demos/data-prep-kit/language/doc-chunk/requirements.txt diff --git a/tests/tests/python-language-doc-chunk/pail/src/doc_chunk_chunkers.py b/demos/data-prep-kit/language/doc-chunk/src/doc_chunk_chunkers.py similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/src/doc_chunk_chunkers.py rename to demos/data-prep-kit/language/doc-chunk/src/doc_chunk_chunkers.py diff --git a/tests/tests/python-language-doc-chunk/pail/src/main.py b/demos/data-prep-kit/language/doc-chunk/src/main.py similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/src/main.py rename to demos/data-prep-kit/language/doc-chunk/src/main.py diff --git a/tests/tests/python-language-doc-chunk/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/language/doc-chunk/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/language/doc-chunk/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-language-doc-chunk/pail/test-data/expected/test1_0.parquet.gz b/demos/data-prep-kit/language/doc-chunk/test-data/expected/test1_0.parquet.gz similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/test-data/expected/test1_0.parquet.gz rename to demos/data-prep-kit/language/doc-chunk/test-data/expected/test1_0.parquet.gz diff --git a/tests/tests/python-language-doc-chunk/pail/test-data/expected_md/2206.01062.parquet.gz b/demos/data-prep-kit/language/doc-chunk/test-data/expected_md/2206.01062.parquet.gz similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/test-data/expected_md/2206.01062.parquet.gz rename to demos/data-prep-kit/language/doc-chunk/test-data/expected_md/2206.01062.parquet.gz diff --git a/tests/tests/python-language-doc-chunk/pail/test-data/expected_md/metadata.json.gz b/demos/data-prep-kit/language/doc-chunk/test-data/expected_md/metadata.json.gz similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/test-data/expected_md/metadata.json.gz rename to demos/data-prep-kit/language/doc-chunk/test-data/expected_md/metadata.json.gz diff --git a/tests/tests/python-language-doc-chunk/pail/test-data/input/test1.parquet.gz b/demos/data-prep-kit/language/doc-chunk/test-data/input/test1.parquet.gz similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/test-data/input/test1.parquet.gz rename to demos/data-prep-kit/language/doc-chunk/test-data/input/test1.parquet.gz diff --git a/tests/tests/python-language-doc-chunk/pail/test-data/input_md/2206.01062.parquet.gz b/demos/data-prep-kit/language/doc-chunk/test-data/input_md/2206.01062.parquet.gz similarity index 100% rename from tests/tests/python-language-doc-chunk/pail/test-data/input_md/2206.01062.parquet.gz rename to demos/data-prep-kit/language/doc-chunk/test-data/input_md/2206.01062.parquet.gz diff --git a/tests/tests/python-language-doc-quality/README.md b/demos/data-prep-kit/language/doc-quality/README.md similarity index 100% rename from tests/tests/python-language-doc-quality/README.md rename to demos/data-prep-kit/language/doc-quality/README.md diff --git a/tests/tests/python-language-doc-quality/pail/blobs/plain/ldnoobw/en b/demos/data-prep-kit/language/doc-quality/blobs/plain/ldnoobw/en similarity index 100% rename from tests/tests/python-language-doc-quality/pail/blobs/plain/ldnoobw/en rename to demos/data-prep-kit/language/doc-quality/blobs/plain/ldnoobw/en diff --git a/tests/tests/python-language-doc-quality/pail/requirements.txt b/demos/data-prep-kit/language/doc-quality/requirements.txt similarity index 100% rename from tests/tests/python-language-doc-quality/pail/requirements.txt rename to demos/data-prep-kit/language/doc-quality/requirements.txt diff --git a/tests/tests/python-language-doc-quality/pail/src/cc_net_prepro.py b/demos/data-prep-kit/language/doc-quality/src/cc_net_prepro.py similarity index 100% rename from tests/tests/python-language-doc-quality/pail/src/cc_net_prepro.py rename to demos/data-prep-kit/language/doc-quality/src/cc_net_prepro.py diff --git a/tests/tests/python-language-doc-quality/pail/src/doc_Gopher_statistics.py b/demos/data-prep-kit/language/doc-quality/src/doc_Gopher_statistics.py similarity index 100% rename from tests/tests/python-language-doc-quality/pail/src/doc_Gopher_statistics.py rename to demos/data-prep-kit/language/doc-quality/src/doc_Gopher_statistics.py diff --git a/tests/tests/python-language-doc-quality/pail/src/doc_c4_statistics.py b/demos/data-prep-kit/language/doc-quality/src/doc_c4_statistics.py similarity index 100% rename from tests/tests/python-language-doc-quality/pail/src/doc_c4_statistics.py rename to demos/data-prep-kit/language/doc-quality/src/doc_c4_statistics.py diff --git a/tests/tests/python-language-doc-quality/pail/src/doc_quality_utils.py b/demos/data-prep-kit/language/doc-quality/src/doc_quality_utils.py similarity index 100% rename from tests/tests/python-language-doc-quality/pail/src/doc_quality_utils.py rename to demos/data-prep-kit/language/doc-quality/src/doc_quality_utils.py diff --git a/tests/tests/python-language-doc-quality/pail/src/main.py b/demos/data-prep-kit/language/doc-quality/src/main.py similarity index 100% rename from tests/tests/python-language-doc-quality/pail/src/main.py rename to demos/data-prep-kit/language/doc-quality/src/main.py diff --git a/tests/tests/python-language-doc-quality/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/language/doc-quality/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-language-doc-quality/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/language/doc-quality/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-language-doc-quality/pail/test-data/expected/test1.parquet.gz b/demos/data-prep-kit/language/doc-quality/test-data/expected/test1.parquet.gz similarity index 100% rename from tests/tests/python-language-doc-quality/pail/test-data/expected/test1.parquet.gz rename to demos/data-prep-kit/language/doc-quality/test-data/expected/test1.parquet.gz diff --git a/tests/tests/python-language-doc-quality/pail/test-data/input/test1.parquet.gz b/demos/data-prep-kit/language/doc-quality/test-data/input/test1.parquet.gz similarity index 100% rename from tests/tests/python-language-doc-quality/pail/test-data/input/test1.parquet.gz rename to demos/data-prep-kit/language/doc-quality/test-data/input/test1.parquet.gz diff --git a/tests/tests/python-language-html2parquet/README.md b/demos/data-prep-kit/language/html2parquet/README.md similarity index 100% rename from tests/tests/python-language-html2parquet/README.md rename to demos/data-prep-kit/language/html2parquet/README.md diff --git a/tests/tests/python-language-html2parquet/pail/requirements.txt b/demos/data-prep-kit/language/html2parquet/requirements.txt similarity index 100% rename from tests/tests/python-language-html2parquet/pail/requirements.txt rename to demos/data-prep-kit/language/html2parquet/requirements.txt diff --git a/tests/tests/python-language-html2parquet/pail/src/main.py b/demos/data-prep-kit/language/html2parquet/src/main.py similarity index 100% rename from tests/tests/python-language-html2parquet/pail/src/main.py rename to demos/data-prep-kit/language/html2parquet/src/main.py diff --git a/tests/tests/python-language-html2parquet/pail/test-data/expected/html_zip.parquet.gz b/demos/data-prep-kit/language/html2parquet/test-data/expected/html_zip.zip.gz similarity index 100% rename from tests/tests/python-language-html2parquet/pail/test-data/expected/html_zip.parquet.gz rename to demos/data-prep-kit/language/html2parquet/test-data/expected/html_zip.zip.gz diff --git a/tests/tests/python-language-html2parquet/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/language/html2parquet/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-language-html2parquet/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/language/html2parquet/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-language-html2parquet/pail/test-data/expected/test1.parquet.gz b/demos/data-prep-kit/language/html2parquet/test-data/expected/test1.html.gz similarity index 100% rename from tests/tests/python-language-html2parquet/pail/test-data/expected/test1.parquet.gz rename to demos/data-prep-kit/language/html2parquet/test-data/expected/test1.html.gz diff --git a/tests/tests/python-language-html2parquet/pail/test-data/input/html_zip.zip b/demos/data-prep-kit/language/html2parquet/test-data/input/html_zip.zip similarity index 100% rename from tests/tests/python-language-html2parquet/pail/test-data/input/html_zip.zip rename to demos/data-prep-kit/language/html2parquet/test-data/input/html_zip.zip diff --git a/tests/tests/python-language-html2parquet/pail/test-data/input/test1.html b/demos/data-prep-kit/language/html2parquet/test-data/input/test1.html similarity index 100% rename from tests/tests/python-language-html2parquet/pail/test-data/input/test1.html rename to demos/data-prep-kit/language/html2parquet/test-data/input/test1.html diff --git a/tests/tests/python-language-lang-id/README.md b/demos/data-prep-kit/language/lang-id/README.md similarity index 100% rename from tests/tests/python-language-lang-id/README.md rename to demos/data-prep-kit/language/lang-id/README.md diff --git a/tests/tests/python-language-lang-id/pail/memory b/demos/data-prep-kit/language/lang-id/memory similarity index 100% rename from tests/tests/python-language-lang-id/pail/memory rename to demos/data-prep-kit/language/lang-id/memory diff --git a/tests/tests/python-language-lang-id/pail/requirements.txt b/demos/data-prep-kit/language/lang-id/requirements.txt similarity index 100% rename from tests/tests/python-language-lang-id/pail/requirements.txt rename to demos/data-prep-kit/language/lang-id/requirements.txt diff --git a/tests/tests/python-language-lang-id/pail/src/lang_models.py b/demos/data-prep-kit/language/lang-id/src/lang_models.py similarity index 100% rename from tests/tests/python-language-lang-id/pail/src/lang_models.py rename to demos/data-prep-kit/language/lang-id/src/lang_models.py diff --git a/tests/tests/python-language-lang-id/pail/src/main.py b/demos/data-prep-kit/language/lang-id/src/main.py similarity index 100% rename from tests/tests/python-language-lang-id/pail/src/main.py rename to demos/data-prep-kit/language/lang-id/src/main.py diff --git a/tests/tests/python-language-lang-id/pail/src/nlp.py b/demos/data-prep-kit/language/lang-id/src/nlp.py similarity index 100% rename from tests/tests/python-language-lang-id/pail/src/nlp.py rename to demos/data-prep-kit/language/lang-id/src/nlp.py diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/expected/metadata.json.gz b/demos/data-prep-kit/language/lang-id/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/expected/metadata.json.gz rename to demos/data-prep-kit/language/lang-id/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/expected/test_01.parquet.gz b/demos/data-prep-kit/language/lang-id/test-data/expected/test_01.parquet.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/expected/test_01.parquet.gz rename to demos/data-prep-kit/language/lang-id/test-data/expected/test_01.parquet.gz diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/expected/test_02.parquet.gz b/demos/data-prep-kit/language/lang-id/test-data/expected/test_02.parquet.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/expected/test_02.parquet.gz rename to demos/data-prep-kit/language/lang-id/test-data/expected/test_02.parquet.gz diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/expected/test_03.parquet.gz b/demos/data-prep-kit/language/lang-id/test-data/expected/test_03.parquet.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/expected/test_03.parquet.gz rename to demos/data-prep-kit/language/lang-id/test-data/expected/test_03.parquet.gz diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/input/test_01.parquet.gz b/demos/data-prep-kit/language/lang-id/test-data/input/test_01.parquet.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/input/test_01.parquet.gz rename to demos/data-prep-kit/language/lang-id/test-data/input/test_01.parquet.gz diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/input/test_02.parquet.gz b/demos/data-prep-kit/language/lang-id/test-data/input/test_02.parquet.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/input/test_02.parquet.gz rename to demos/data-prep-kit/language/lang-id/test-data/input/test_02.parquet.gz diff --git a/tests/tests/python-language-lang-id/pail/test-data/sm/input/test_03.parquet.gz b/demos/data-prep-kit/language/lang-id/test-data/input/test_03.parquet.gz similarity index 100% rename from tests/tests/python-language-lang-id/pail/test-data/sm/input/test_03.parquet.gz rename to demos/data-prep-kit/language/lang-id/test-data/input/test_03.parquet.gz diff --git a/tests/tests/python-language-pdf2parquet/README.md b/demos/data-prep-kit/language/pdf2parquet/README.md similarity index 100% rename from tests/tests/python-language-pdf2parquet/README.md rename to demos/data-prep-kit/language/pdf2parquet/README.md diff --git a/tests/tests/python-language-pdf2parquet/pail/command b/demos/data-prep-kit/language/pdf2parquet/command similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/command rename to demos/data-prep-kit/language/pdf2parquet/command diff --git a/tests/tests/python-language-pdf2parquet/pail/env.yaml b/demos/data-prep-kit/language/pdf2parquet/env.yaml similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/env.yaml rename to demos/data-prep-kit/language/pdf2parquet/env.yaml diff --git a/tests/tests/python-language-pdf2parquet/pail/image b/demos/data-prep-kit/language/pdf2parquet/image similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/image rename to demos/data-prep-kit/language/pdf2parquet/image diff --git a/tests/tests/python-language-pdf2parquet/pail/requirements.txt b/demos/data-prep-kit/language/pdf2parquet/requirements.txt similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/requirements.txt rename to demos/data-prep-kit/language/pdf2parquet/requirements.txt diff --git a/tests/tests/python-language-pdf2parquet/pail/src/main.py b/demos/data-prep-kit/language/pdf2parquet/src/main.py similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/src/main.py rename to demos/data-prep-kit/language/pdf2parquet/src/main.py diff --git a/tests/tests/python-language-pdf2parquet/pail/test-data/expected/archive1.parquet.gz b/demos/data-prep-kit/language/pdf2parquet/test-data/expected/archive1.zip.gz similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/test-data/expected/archive1.parquet.gz rename to demos/data-prep-kit/language/pdf2parquet/test-data/expected/archive1.zip.gz diff --git a/tests/tests/python-language-pdf2parquet/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/language/pdf2parquet/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/language/pdf2parquet/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-language-pdf2parquet/pail/test-data/expected/redp5110-ch1.parquet.gz b/demos/data-prep-kit/language/pdf2parquet/test-data/expected/redp5110-ch1.pdf.gz similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/test-data/expected/redp5110-ch1.parquet.gz rename to demos/data-prep-kit/language/pdf2parquet/test-data/expected/redp5110-ch1.pdf.gz diff --git a/tests/tests/python-language-pdf2parquet/pail/test-data/input/archive1.zip b/demos/data-prep-kit/language/pdf2parquet/test-data/input/archive1.zip similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/test-data/input/archive1.zip rename to demos/data-prep-kit/language/pdf2parquet/test-data/input/archive1.zip diff --git a/tests/tests/python-language-pdf2parquet/pail/test-data/input/redp5110-ch1.pdf b/demos/data-prep-kit/language/pdf2parquet/test-data/input/redp5110-ch1.pdf similarity index 100% rename from tests/tests/python-language-pdf2parquet/pail/test-data/input/redp5110-ch1.pdf rename to demos/data-prep-kit/language/pdf2parquet/test-data/input/redp5110-ch1.pdf diff --git a/tests/tests/python-language-pii-redactor/README.md b/demos/data-prep-kit/language/pii-redactor/README.md similarity index 100% rename from tests/tests/python-language-pii-redactor/README.md rename to demos/data-prep-kit/language/pii-redactor/README.md diff --git a/tests/tests/python-language-pii-redactor/pail/command b/demos/data-prep-kit/language/pii-redactor/command similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/command rename to demos/data-prep-kit/language/pii-redactor/command diff --git a/tests/tests/python-language-pii-redactor/pail/image b/demos/data-prep-kit/language/pii-redactor/image similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/image rename to demos/data-prep-kit/language/pii-redactor/image diff --git a/tests/tests/python-language-pii-redactor/pail/memory b/demos/data-prep-kit/language/pii-redactor/memory similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/memory rename to demos/data-prep-kit/language/pii-redactor/memory diff --git a/tests/tests/python-language-pii-redactor/pail/requirements.txt b/demos/data-prep-kit/language/pii-redactor/requirements.txt similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/requirements.txt rename to demos/data-prep-kit/language/pii-redactor/requirements.txt diff --git a/tests/tests/python-language-pii-redactor/pail/src/flair_recognizer.py b/demos/data-prep-kit/language/pii-redactor/src/flair_recognizer.py similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/src/flair_recognizer.py rename to demos/data-prep-kit/language/pii-redactor/src/flair_recognizer.py diff --git a/tests/tests/python-language-pii-redactor/pail/src/main.py b/demos/data-prep-kit/language/pii-redactor/src/main.py similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/src/main.py rename to demos/data-prep-kit/language/pii-redactor/src/main.py diff --git a/tests/tests/python-language-pii-redactor/pail/src/pii_analyzer.py b/demos/data-prep-kit/language/pii-redactor/src/pii_analyzer.py similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/src/pii_analyzer.py rename to demos/data-prep-kit/language/pii-redactor/src/pii_analyzer.py diff --git a/tests/tests/python-language-pii-redactor/pail/src/pii_anonymizer.py b/demos/data-prep-kit/language/pii-redactor/src/pii_anonymizer.py similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/src/pii_anonymizer.py rename to demos/data-prep-kit/language/pii-redactor/src/pii_anonymizer.py diff --git a/tests/tests/python-language-pii-redactor/pail/test-data/xs/1.expected.parquet.gz b/demos/data-prep-kit/language/pii-redactor/test-data/expected/xs.parquet.gz similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/test-data/xs/1.expected.parquet.gz rename to demos/data-prep-kit/language/pii-redactor/test-data/expected/xs.parquet.gz diff --git a/tests/tests/python-language-pii-redactor/pail/test-data/xs/1.parquet.gz b/demos/data-prep-kit/language/pii-redactor/test-data/input/xs.parquet.gz similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/test-data/xs/1.parquet.gz rename to demos/data-prep-kit/language/pii-redactor/test-data/input/xs.parquet.gz diff --git a/demos/data-prep-kit/language/pii-redactor/test-data/sm/expected/sm.parquet.gz b/demos/data-prep-kit/language/pii-redactor/test-data/sm/expected/sm.parquet.gz new file mode 100644 index 000000000..c26672585 Binary files /dev/null and b/demos/data-prep-kit/language/pii-redactor/test-data/sm/expected/sm.parquet.gz differ diff --git a/tests/tests/python-language-pii-redactor/pail/test-data/sm/pii_test_data.parquet.gz b/demos/data-prep-kit/language/pii-redactor/test-data/sm/input/sm.parquet.gz similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/test-data/sm/pii_test_data.parquet.gz rename to demos/data-prep-kit/language/pii-redactor/test-data/sm/input/sm.parquet.gz diff --git a/tests/tests/python-language-pii-redactor/pail/test-data/xs/xs.py b/demos/data-prep-kit/language/pii-redactor/test-data/xs.py similarity index 100% rename from tests/tests/python-language-pii-redactor/pail/test-data/xs/xs.py rename to demos/data-prep-kit/language/pii-redactor/test-data/xs.py diff --git a/tests/tests/python-language-text-encoder/README.md b/demos/data-prep-kit/language/text-encoder/README.md similarity index 100% rename from tests/tests/python-language-text-encoder/README.md rename to demos/data-prep-kit/language/text-encoder/README.md diff --git a/tests/tests/python-language-text-encoder/pail/command b/demos/data-prep-kit/language/text-encoder/command similarity index 100% rename from tests/tests/python-language-text-encoder/pail/command rename to demos/data-prep-kit/language/text-encoder/command diff --git a/tests/tests/python-language-text-encoder/pail/image b/demos/data-prep-kit/language/text-encoder/image similarity index 100% rename from tests/tests/python-language-text-encoder/pail/image rename to demos/data-prep-kit/language/text-encoder/image diff --git a/tests/tests/python-language-text-encoder/pail/requirements.txt b/demos/data-prep-kit/language/text-encoder/requirements.txt similarity index 100% rename from tests/tests/python-language-text-encoder/pail/requirements.txt rename to demos/data-prep-kit/language/text-encoder/requirements.txt diff --git a/tests/tests/python-language-text-encoder/pail/requirements_linux_ci.txt b/demos/data-prep-kit/language/text-encoder/requirements_linux_ci.txt similarity index 100% rename from tests/tests/python-language-text-encoder/pail/requirements_linux_ci.txt rename to demos/data-prep-kit/language/text-encoder/requirements_linux_ci.txt diff --git a/tests/tests/python-language-text-encoder/pail/src/main.py b/demos/data-prep-kit/language/text-encoder/src/main.py similarity index 100% rename from tests/tests/python-language-text-encoder/pail/src/main.py rename to demos/data-prep-kit/language/text-encoder/src/main.py diff --git a/tests/tests/python-language-text-encoder/pail/test-data/expected/test1.parquet.gz b/demos/data-prep-kit/language/text-encoder/test-data/NOVALIDATE-test1.parquet.gz similarity index 100% rename from tests/tests/python-language-text-encoder/pail/test-data/expected/test1.parquet.gz rename to demos/data-prep-kit/language/text-encoder/test-data/NOVALIDATE-test1.parquet.gz diff --git a/tests/tests/python-language-text-encoder/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/language/text-encoder/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-language-text-encoder/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/language/text-encoder/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-language-text-encoder/pail/test-data/input/test1.parquet.gz b/demos/data-prep-kit/language/text-encoder/test-data/input/test1.parquet.gz similarity index 100% rename from tests/tests/python-language-text-encoder/pail/test-data/input/test1.parquet.gz rename to demos/data-prep-kit/language/text-encoder/test-data/input/test1.parquet.gz diff --git a/tests/tests/python-universal-doc-id/pail/command b/demos/data-prep-kit/universal/doc-id/command similarity index 100% rename from tests/tests/python-universal-doc-id/pail/command rename to demos/data-prep-kit/universal/doc-id/command diff --git a/tests/tests/python-universal-doc-id/pail/image b/demos/data-prep-kit/universal/doc-id/image similarity index 100% rename from tests/tests/python-universal-doc-id/pail/image rename to demos/data-prep-kit/universal/doc-id/image diff --git a/tests/tests/python-universal-doc-id/pail/requirements.txt b/demos/data-prep-kit/universal/doc-id/requirements.txt similarity index 100% rename from tests/tests/python-universal-doc-id/pail/requirements.txt rename to demos/data-prep-kit/universal/doc-id/requirements.txt diff --git a/tests/tests/python-universal-doc-id/pail/src/doc_id_transform_base.py b/demos/data-prep-kit/universal/doc-id/src/doc_id_transform_base.py similarity index 100% rename from tests/tests/python-universal-doc-id/pail/src/doc_id_transform_base.py rename to demos/data-prep-kit/universal/doc-id/src/doc_id_transform_base.py diff --git a/tests/tests/python-universal-doc-id/pail/src/doc_id_transform_python.py b/demos/data-prep-kit/universal/doc-id/src/doc_id_transform_python.py similarity index 100% rename from tests/tests/python-universal-doc-id/pail/src/doc_id_transform_python.py rename to demos/data-prep-kit/universal/doc-id/src/doc_id_transform_python.py diff --git a/tests/tests/python-universal-doc-id/pail/src/main.py b/demos/data-prep-kit/universal/doc-id/src/main.py similarity index 100% rename from tests/tests/python-universal-doc-id/pail/src/main.py rename to demos/data-prep-kit/universal/doc-id/src/main.py diff --git a/tests/tests/python-universal-doc-id/pail/test-data/expected/sample1.parquet.gz b/demos/data-prep-kit/universal/doc-id/test-data/NOVALIDATE-sample1.parquet.gz similarity index 100% rename from tests/tests/python-universal-doc-id/pail/test-data/expected/sample1.parquet.gz rename to demos/data-prep-kit/universal/doc-id/test-data/NOVALIDATE-sample1.parquet.gz diff --git a/tests/tests/python-universal-doc-id/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/universal/doc-id/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-universal-doc-id/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/universal/doc-id/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-universal-doc-id/pail/test-data/input/sample1.parquet.gz b/demos/data-prep-kit/universal/doc-id/test-data/input/sample1.parquet.gz similarity index 100% rename from tests/tests/python-universal-doc-id/pail/test-data/input/sample1.parquet.gz rename to demos/data-prep-kit/universal/doc-id/test-data/input/sample1.parquet.gz diff --git a/tests/tests/python-universal-ededup/pail/command b/demos/data-prep-kit/universal/ededup/command similarity index 100% rename from tests/tests/python-universal-ededup/pail/command rename to demos/data-prep-kit/universal/ededup/command diff --git a/tests/tests/python-universal-ededup/pail/image b/demos/data-prep-kit/universal/ededup/image similarity index 100% rename from tests/tests/python-universal-ededup/pail/image rename to demos/data-prep-kit/universal/ededup/image diff --git a/tests/tests/python-universal-ededup/pail/requirements.txt b/demos/data-prep-kit/universal/ededup/requirements.txt similarity index 100% rename from tests/tests/python-universal-ededup/pail/requirements.txt rename to demos/data-prep-kit/universal/ededup/requirements.txt diff --git a/tests/tests/python-universal-ededup/pail/src/ededup_transform_base.py b/demos/data-prep-kit/universal/ededup/src/ededup_transform_base.py similarity index 100% rename from tests/tests/python-universal-ededup/pail/src/ededup_transform_base.py rename to demos/data-prep-kit/universal/ededup/src/ededup_transform_base.py diff --git a/tests/tests/python-universal-ededup/pail/src/ededup_transform_python.py b/demos/data-prep-kit/universal/ededup/src/ededup_transform_python.py similarity index 100% rename from tests/tests/python-universal-ededup/pail/src/ededup_transform_python.py rename to demos/data-prep-kit/universal/ededup/src/ededup_transform_python.py diff --git a/tests/tests/python-universal-ededup/pail/src/main.py b/demos/data-prep-kit/universal/ededup/src/main.py similarity index 100% rename from tests/tests/python-universal-ededup/pail/src/main.py rename to demos/data-prep-kit/universal/ededup/src/main.py diff --git a/tests/tests/python-universal-ededup/pail/test-data/expected/sample1.parquet.gz b/demos/data-prep-kit/universal/ededup/test-data/NOVALIDATE-sample1.parquet.gz similarity index 100% rename from tests/tests/python-universal-ededup/pail/test-data/expected/sample1.parquet.gz rename to demos/data-prep-kit/universal/ededup/test-data/NOVALIDATE-sample1.parquet.gz diff --git a/tests/tests/python-universal-ededup/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/universal/ededup/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-universal-ededup/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/universal/ededup/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-universal-ededup/pail/test-data/input/sample1.parquet.gz b/demos/data-prep-kit/universal/ededup/test-data/input/sample1.parquet.gz similarity index 100% rename from tests/tests/python-universal-ededup/pail/test-data/input/sample1.parquet.gz rename to demos/data-prep-kit/universal/ededup/test-data/input/sample1.parquet.gz diff --git a/tests/tests/python-universal-filter/pail/requirements.txt b/demos/data-prep-kit/universal/filter/requirements.txt similarity index 100% rename from tests/tests/python-universal-filter/pail/requirements.txt rename to demos/data-prep-kit/universal/filter/requirements.txt diff --git a/tests/tests/python-universal-filter/pail/src/main.py b/demos/data-prep-kit/universal/filter/src/main.py similarity index 100% rename from tests/tests/python-universal-filter/pail/src/main.py rename to demos/data-prep-kit/universal/filter/src/main.py diff --git a/tests/tests/python-universal-filter/pail/test-data/expected/metadata.json.gz b/demos/data-prep-kit/universal/filter/test-data/expected/metadata.json.gz similarity index 100% rename from tests/tests/python-universal-filter/pail/test-data/expected/metadata.json.gz rename to demos/data-prep-kit/universal/filter/test-data/expected/metadata.json.gz diff --git a/tests/tests/python-universal-filter/pail/test-data/expected/test1.parquet.gz b/demos/data-prep-kit/universal/filter/test-data/expected/test1.parquet.gz similarity index 100% rename from tests/tests/python-universal-filter/pail/test-data/expected/test1.parquet.gz rename to demos/data-prep-kit/universal/filter/test-data/expected/test1.parquet.gz diff --git a/tests/tests/python-universal-filter/pail/test-data/input/test1.parquet.gz b/demos/data-prep-kit/universal/filter/test-data/input/test1.parquet.gz similarity index 100% rename from tests/tests/python-universal-filter/pail/test-data/input/test1.parquet.gz rename to demos/data-prep-kit/universal/filter/test-data/input/test1.parquet.gz diff --git a/demos/data-prep-kit/universal/resize/env.yaml b/demos/data-prep-kit/universal/resize/env.yaml new file mode 100644 index 000000000..3c1efa7e7 --- /dev/null +++ b/demos/data-prep-kit/universal/resize/env.yaml @@ -0,0 +1 @@ +max_rows_per_table: 125 diff --git a/tests/tests/python-universal-resize/pail/requirements.txt b/demos/data-prep-kit/universal/resize/requirements.txt similarity index 100% rename from tests/tests/python-universal-resize/pail/requirements.txt rename to demos/data-prep-kit/universal/resize/requirements.txt diff --git a/tests/tests/python-universal-resize/pail/src/main.py b/demos/data-prep-kit/universal/resize/src/main.py similarity index 100% rename from tests/tests/python-universal-resize/pail/src/main.py rename to demos/data-prep-kit/universal/resize/src/main.py diff --git a/tests/tests/python-universal-resize/pail/test-data/expected/test1_0.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/expected/test1_0.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/expected/test1_0.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/expected/test1_0.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/expected/test1_1.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/expected/test1_1.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/expected/test1_1.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/expected/test1_1.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/expected/test2_0.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/expected/test2_0.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/expected/test2_0.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/expected/test2_0.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/expected/test2_1.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/expected/test2_1.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/expected/test2_1.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/expected/test2_1.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/expected/test3_0.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/expected/test3_0.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/expected/test3_0.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/expected/test3_0.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/expected/test3_1.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/expected/test3_1.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/expected/test3_1.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/expected/test3_1.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/input/test1.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/input/test1.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/input/test1.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/input/test1.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/input/test2.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/input/test2.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/input/test2.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/input/test2.parquet.gz diff --git a/tests/tests/python-universal-resize/pail/test-data/input/test3.parquet.gz b/demos/data-prep-kit/universal/resize/test-data/input/test3.parquet.gz similarity index 100% rename from tests/tests/python-universal-resize/pail/test-data/input/test3.parquet.gz rename to demos/data-prep-kit/universal/resize/test-data/input/test3.parquet.gz diff --git a/demos/data-prep-kit/universal/tokenization/command b/demos/data-prep-kit/universal/tokenization/command new file mode 100644 index 000000000..55259c7a0 --- /dev/null +++ b/demos/data-prep-kit/universal/tokenization/command @@ -0,0 +1 @@ +python3.12 main.py \ No newline at end of file diff --git a/demos/data-prep-kit/universal/tokenization/image b/demos/data-prep-kit/universal/tokenization/image new file mode 100644 index 000000000..e5cf57b9a --- /dev/null +++ b/demos/data-prep-kit/universal/tokenization/image @@ -0,0 +1 @@ +docker.io/python:3.12 \ No newline at end of file diff --git a/tests/tests/python-universal-tokenization/pail/requirements.txt b/demos/data-prep-kit/universal/tokenization/requirements.txt similarity index 100% rename from tests/tests/python-universal-tokenization/pail/requirements.txt rename to demos/data-prep-kit/universal/tokenization/requirements.txt diff --git a/tests/tests/python-universal-tokenization/pail/src/main.py b/demos/data-prep-kit/universal/tokenization/src/main.py similarity index 100% rename from tests/tests/python-universal-tokenization/pail/src/main.py rename to demos/data-prep-kit/universal/tokenization/src/main.py diff --git a/tests/tests/python-universal-tokenization/pail/src/tokenization_utils.py b/demos/data-prep-kit/universal/tokenization/src/tokenization_utils.py similarity index 100% rename from tests/tests/python-universal-tokenization/pail/src/tokenization_utils.py rename to demos/data-prep-kit/universal/tokenization/src/tokenization_utils.py diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/pq01.parquet.gz b/demos/data-prep-kit/universal/tokenization/test-data/expected/ds01_pq01.parquet.gz similarity index 100% rename from tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/pq01.parquet.gz rename to demos/data-prep-kit/universal/tokenization/test-data/expected/ds01_pq01.parquet.gz diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/pq02.parquet.gz b/demos/data-prep-kit/universal/tokenization/test-data/expected/ds01_pq02.parquet.gz similarity index 100% rename from tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/pq02.parquet.gz rename to demos/data-prep-kit/universal/tokenization/test-data/expected/ds01_pq02.parquet.gz diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq01.parquet.gz b/demos/data-prep-kit/universal/tokenization/test-data/input/ds01_pq01.parquet.gz similarity index 100% rename from tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq01.parquet.gz rename to demos/data-prep-kit/universal/tokenization/test-data/input/ds01_pq01.parquet.gz diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq02.parquet.gz b/demos/data-prep-kit/universal/tokenization/test-data/input/ds01_pq02.parquet.gz similarity index 100% rename from tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq02.parquet.gz rename to demos/data-prep-kit/universal/tokenization/test-data/input/ds01_pq02.parquet.gz diff --git a/pkg/boot/bat.go b/pkg/boot/bat.go new file mode 100644 index 000000000..b1a0f7238 --- /dev/null +++ b/pkg/boot/bat.go @@ -0,0 +1,161 @@ +//go:build full || manage + +package boot + +import ( + "bufio" + "context" + "fmt" + "io" + "io/fs" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "slices" + + "golang.org/x/sync/errgroup" + + "lunchpail.io/pkg/be" + "lunchpail.io/pkg/build" + "lunchpail.io/pkg/fe/builder" + "lunchpail.io/pkg/fe/builder/overlay" + "lunchpail.io/pkg/observe/colors" +) + +type BuildAndTester struct { + Concurrency int + be.Backend + build.Options +} + +// Run build&test for all applications in all of the given `dirs` +func (t BuildAndTester) RunAll(ctx context.Context, dirs []string) error { + fmt.Fprintln(os.Stderr, "Starting build and test for", dirs) + + dirForBinaries, err := ioutil.TempDir("", "lunchpail-bat-") + if err != nil { + return err + } + defer os.RemoveAll(dirForBinaries) + + group, gctx := errgroup.WithContext(ctx) + if t.Concurrency != 0 { + group.SetLimit(t.Concurrency) + } + + for _, dir := range dirs { + if err := t.RunDir(gctx, group, dir, dirForBinaries); err != nil { + return err + } + } + + return group.Wait() +} + +// Run build&test for all applications in the given `dir` +func (t BuildAndTester) RunDir(ctx context.Context, group *errgroup.Group, dir, dirForBinaries string) error { + return filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { + if d.Name() == "src" || d.Name() == "test-data" { + return fs.SkipDir + } else if !d.IsDir() || d.Name() == filepath.Base(dir) { + return nil + } + + if files, err := os.ReadDir(path); err != nil { + return err + } else if slices.IndexFunc(files, func(f fs.DirEntry) bool { return f.Name() == "src" || f.Name() == "test-data" }) < 0 { + // not an app directory + if t.Options.Verbose() { + fmt.Fprintln(os.Stderr, "Skipping build and test for", path) + } + return nil + } + + group.Go(func() error { + binaryRelPath, err := filepath.Rel(dir, path) + if err != nil { + return err + } + binaryFullPath := filepath.Join(dirForBinaries, binaryRelPath) + return t.Run(ctx, path, binaryRelPath, binaryFullPath) + }) + + return nil + }) +} + +// Run one build&test for the application specified in `sourcePath`, storing the build in `binaryFullPath` +func (t BuildAndTester) Run(ctx context.Context, sourcePath, binaryRelPath, binaryFullPath string) error { + select { + case <-ctx.Done(): + return nil + default: + } + + if err := builder.Build( + ctx, + sourcePath, + builder.Options{ + Name: binaryFullPath, + OverlayOptions: overlay.Options{BuildOptions: t.Options}, + }, + ); err != nil { + return err + } + + args := []string{"test"} + if t.Options.Verbose() { + args = append(args, "--verbose") + } + + cmd := exec.CommandContext(ctx, binaryFullPath, args...) + + stdout, err := cmd.StdoutPipe() + if err != nil { + return err + } + stderr, err := cmd.StderrPipe() + if err != nil { + return err + } + if err := cmd.Start(); err != nil { + fmt.Fprintf(os.Stderr, "Error launching test %s: %v\n", binaryRelPath, err) + return err + } + doneout := make(chan struct{}) + doneerr := make(chan struct{}) + + go pipe(binaryRelPath, stdout, os.Stdout, doneout) + go pipe(binaryRelPath, stderr, os.Stderr, doneerr) + + select { + case <-ctx.Done(): + return nil + case <-doneout: + } + select { + case <-ctx.Done(): + return nil + case <-doneerr: + } + + return cmd.Wait() +} + +// Pipe the output of the test, prefixing emitted lines with the given prefix (application name) +func pipe(prefix string, r io.Reader, w io.Writer, done chan<- struct{}) { + reader := bufio.NewReader(r) + + for { + line, _, err := reader.ReadLine() + + if err == io.EOF { + break + } + + fmt.Fprintf(w, "%s %s\n", colors.Yellow.Render(prefix), line) + } + + done <- struct{}{} +} diff --git a/pkg/boot/io.go b/pkg/boot/io.go index d320e76a9..6f854a45d 100644 --- a/pkg/boot/io.go +++ b/pkg/boot/io.go @@ -63,7 +63,7 @@ func catAndRedirect(ctx context.Context, inputs []string, backend be.Backend, ir // TODO: backend.Wait(ir)? which would be a no-op for local // If we aren't piped into anything, then copy out the outbox files - if isFinalStep(ir.Context) && !noRedirect { + if redirectTo != "" || isFinalStep(ir.Context) && !noRedirect { // We try to place the output files in the same // directory as the respective input files. TODO: this // may be a fool's errand, e.g. what if a single input diff --git a/pkg/boot/tester.go b/pkg/boot/tester.go index 80d76dd85..bbe3f3968 100644 --- a/pkg/boot/tester.go +++ b/pkg/boot/tester.go @@ -20,6 +20,7 @@ import ( ) type Tester struct { + Quiet bool be.Backend build.Options } @@ -48,7 +49,10 @@ func (t Tester) prepareInputs(testData hlir.TestData, stageDir string) (inputs [ expectedDir := build.TestDataDirForExpected(stageDir) for _, test := range testData { inputs = append(inputs, filepath.Join(inputDir, test.Input)) - outputs = append(outputs, filepath.Join(expectedDir, test.Expected)) + + for _, expected := range test.Expected { + outputs = append(outputs, filepath.Join(expectedDir, expected)) + } } if t.Options.Verbose() { @@ -59,7 +63,7 @@ func (t Tester) prepareInputs(testData hlir.TestData, stageDir string) (inputs [ } func (t Tester) Run(ctx context.Context, inputs []string, expected []string) error { - fmt.Fprintf(os.Stderr, "Scheduling %s for %s\n", english.Plural(len(inputs), "test", ""), build.Name()) + fmt.Fprintf(os.Stderr, "Testing %s\n", english.Plural(len(inputs), "input", "")) if slices.IndexFunc(inputs, func(input string) bool { return filepath.Ext(input) == ".gz" }) >= 0 { t.Options.Gunzip = true @@ -73,7 +77,9 @@ func (t Tester) Run(ctx context.Context, inputs []string, expected []string) err defer os.RemoveAll(redirectTo) } - if err := Up(ctx, t.Backend, UpOptions{Inputs: inputs, BuildOptions: t.Options, RedirectTo: redirectTo}); err != nil { + if runContext, err := Up(ctx, t.Backend, UpOptions{Inputs: inputs, BuildOptions: t.Options, RedirectTo: redirectTo, Watch: !t.Quiet}); err != nil { + return err + } else if err := Down(ctx, runContext.Run.RunName, t.Backend, DownOptions{Namespace: t.Options.Target.Namespace, Verbose: t.Options.Verbose()}); err != nil { return err } @@ -105,8 +111,8 @@ func (t Tester) validate(inputs []string, expecteds []string, redirectTo string) } found := 0 - for idx, expected := range expecteds { - expectedFileName := filepath.Base(inputs[idx]) + for _, expected := range expecteds { + expectedFileName := filepath.Base(expected) // TODO O(N^2) for _, actual := range actuals { @@ -128,7 +134,7 @@ func (t Tester) validate(inputs []string, expecteds []string, redirectTo string) if ok, err := t.equal(matchesWithGunzip, expectedBytes, actualBytes); err != nil { return err } else if !ok { - return fmt.Errorf("actual!=expected for %s", filepath.Base(inputs[idx])) + return fmt.Errorf("actual!=expected for %s", expectedFileName) } } } diff --git a/pkg/boot/up.go b/pkg/boot/up.go index d3c28b7be..a0991216e 100644 --- a/pkg/boot/up.go +++ b/pkg/boot/up.go @@ -24,24 +24,26 @@ type UpOptions struct { Inputs []string DryRun bool Watch bool + WatchUtil bool BuildOptions build.Options Executable string NoRedirect bool RedirectTo string } -func Up(ctx context.Context, backend be.Backend, opts UpOptions) error { +func Up(ctx context.Context, backend be.Backend, opts UpOptions) (llir.Context, error) { pipelineContext, err := handlePipelineStdin() if err != nil { - return err + return llir.Context{}, err } ir, err := fe.PrepareForRun(pipelineContext, fe.PrepareOptions{}, opts.BuildOptions) if err != nil { - return err + return llir.Context{}, err } - return upLLIR(ctx, backend, ir, opts) + err = upLLIR(ctx, backend, ir, opts) + return ir.Context, err } func UpHLIR(ctx context.Context, backend be.Backend, ir hlir.HLIR, opts UpOptions) error { @@ -111,7 +113,7 @@ func upLLIR(ctx context.Context, backend be.Backend, ir llir.LLIR, opts UpOption } }() - if opts.Watch && !util.StdoutIsTty() { + if opts.Watch && opts.RedirectTo == "" && !util.StdoutIsTty() { // if stdout is not a tty, then we can't support // watch, no matter what the user asked for fmt.Fprintf(os.Stderr, "Warning: disabling watch mode because stdout is not a tty\n") @@ -188,7 +190,10 @@ func upLLIR(ctx context.Context, backend be.Backend, ir llir.LLIR, opts UpOption case <-isRunning6: } go watchLogs(cancellable, backend, ir, logsDone, WatchOptions{Verbose: verbose}) - go watchUtilization(cancellable, backend, ir, WatchOptions{Verbose: verbose}) + + if opts.WatchUtil { + go watchUtilization(cancellable, backend, ir, WatchOptions{Verbose: verbose}) + } }() } @@ -196,8 +201,10 @@ func upLLIR(ctx context.Context, backend be.Backend, ir llir.LLIR, opts UpOption select { case <-cancellable.Done(): case ctx := <-isRunning6: - if err := handlePipelineStdout(ctx); err != nil { - fmt.Fprintln(os.Stderr, err) + if opts.RedirectTo == "" { + if err := handlePipelineStdout(ctx); err != nil { + fmt.Fprintln(os.Stderr, err) + } } } }() diff --git a/pkg/build/options.go b/pkg/build/options.go index d880d86be..271e7bbe9 100644 --- a/pkg/build/options.go +++ b/pkg/build/options.go @@ -46,6 +46,9 @@ type Options struct { // Gunzip inputs before passing them to the worker logic Gunzip bool `yaml:",omitempty"` + + // Clean up any caches prior to exiting + AutoClean bool `yaml:"autoClean,omitempty"` } //go:embed buildOptions.json diff --git a/pkg/fe/builder/build.go b/pkg/fe/builder/build.go index 80d883e5d..2819c690f 100644 --- a/pkg/fe/builder/build.go +++ b/pkg/fe/builder/build.go @@ -9,6 +9,7 @@ import ( "lunchpail.io/pkg/build" "lunchpail.io/pkg/fe/builder/overlay" + "lunchpail.io/pkg/observe/colors" "lunchpail.io/pkg/util" ) @@ -47,7 +48,7 @@ func Build(ctx context.Context, sourcePath string, opts Options) error { // Second, pick a name for the resulting build. TODO: allow command line override? buildName := buildNameFrom(sourcePath) - fmt.Fprintf(os.Stderr, "Building %s\n", buildName) + fmt.Fprintf(os.Stderr, "%s Building...\n", colors.Yellow.Render(buildName)) // Third, overlay source (if given) appTemplatePath, appVersion, hasTestData, err := overlay.OverlaySourceOntoPriorBuild(buildName, sourcePath, opts.OverlayOptions) diff --git a/pkg/fe/builder/overlay/filesystem.go b/pkg/fe/builder/overlay/filesystem.go index c0bd0abc0..23382329d 100644 --- a/pkg/fe/builder/overlay/filesystem.go +++ b/pkg/fe/builder/overlay/filesystem.go @@ -9,12 +9,15 @@ import ( "regexp" "runtime" "slices" + "strconv" "strings" + "github.com/dustin/go-humanize/english" "gopkg.in/yaml.v3" "lunchpail.io/pkg/build" "lunchpail.io/pkg/ir/hlir" + "lunchpail.io/pkg/observe/colors" ) type filesystemBuilder struct { @@ -269,10 +272,32 @@ func (b filesystemBuilder) addTestData(spec *hlir.Spec, sourcePath, templatePath output := filepath.Join(expectedDir, input.Name()) if _, err := os.Stat(output); err != nil { - // Then the application does not provided expected output - fmt.Fprintln(os.Stderr, "Warning: expected output not provided for", input.Name()) + // Hmm, check if it exists with a .gz extension + output = filepath.Join(expectedDir, input.Name()+".gz") + if _, err := os.Stat(output); err != nil { + // Hmm, check if it exists with _0, _1, ... extensions + idx := strings.Index(input.Name(), ".") + if idx >= 0 { + outputNum := 0 + for { + output = filepath.Join(expectedDir, input.Name()[:idx]+"_"+strconv.Itoa(outputNum)+input.Name()[idx:]) + if _, err := os.Stat(output); err != nil { + break + } + test.Expected = append(test.Expected, filepath.Base(output)) + outputNum++ + } + } + } else { + test.Expected = []string{filepath.Base(output)} + } } else { - test.Expected = input.Name() + test.Expected = []string{filepath.Base(output)} + } + + if len(test.Expected) == 0 { + // Then the application does not provided expected output + fmt.Fprintf(os.Stderr, "%s Warning: expected output not provided for %s\n", colors.Yellow.Render(b.appname), input.Name()) } spec.TestData = append(spec.TestData, test) @@ -280,8 +305,8 @@ func (b filesystemBuilder) addTestData(spec *hlir.Spec, sourcePath, templatePath } } - if b.verbose && len(spec.TestData) > 0 { - fmt.Fprintf(os.Stderr, "Application provided %d test inputs\n", len(spec.TestData)) + if len(spec.TestData) > 0 { + fmt.Fprintf(os.Stderr, "%s Application provided %s\n", colors.Yellow.Render(b.appname), english.Plural(len(spec.TestData), "test input", "")) } return nil diff --git a/pkg/fe/transformer/api/shell/lower.go b/pkg/fe/transformer/api/shell/lower.go index 6eec6a32b..894c0924d 100644 --- a/pkg/fe/transformer/api/shell/lower.go +++ b/pkg/fe/transformer/api/shell/lower.go @@ -48,6 +48,11 @@ func LowerAsComponent(buildName string, ctx llir.Context, app hlir.Application, app.Spec.Env["LUNCHPAIL_STEP"] = strconv.Itoa(ctx.Run.Step) app.Spec.Env["LUNCHPAIL_QUEUE_BUCKET"] = ctx.Queue.Bucket + clean := "" + if opts.AutoClean { + clean = `trap "echo 'Cleaning up venv $(dirname $venvBin)'; rm -rf $(dirname $venvBin)" EXIT` + } + for _, needs := range app.Spec.Needs { var req string @@ -59,8 +64,10 @@ func LowerAsComponent(buildName string, ctx llir.Context, app hlir.Application, } component.Spec.Command = fmt.Sprintf(`set -e -PATH=$($LUNCHPAIL_EXE needs %s %s %s --verbose=%v):$PATH -%s`, needs.Name, needs.Version, req, opts.Log.Verbose, component.Spec.Command) +venvBin="$($LUNCHPAIL_EXE needs %s %s %s --verbose=%v)" +PATH="$venvBin":$PATH +%s +%s`, needs.Name, needs.Version, req, opts.Log.Verbose, clean, component.Spec.Command) } for _, dataset := range app.Spec.Datasets { diff --git a/pkg/ir/hlir/testdata.go b/pkg/ir/hlir/testdata.go index 7fe9e68f5..55a5ad60f 100644 --- a/pkg/ir/hlir/testdata.go +++ b/pkg/ir/hlir/testdata.go @@ -1,9 +1,11 @@ package hlir type TestDatum struct { - Name string - Input string - Expected string + Name string + Input string + + // Each Input may provide 0 or more Expected outputs, hence the array + Expected []string } type TestData = []TestDatum diff --git a/pkg/observe/colors/styles.go b/pkg/observe/colors/styles.go index ec02d4ddf..f7567ceec 100644 --- a/pkg/observe/colors/styles.go +++ b/pkg/observe/colors/styles.go @@ -23,21 +23,34 @@ var Red = lipgloss.NewStyle().Foreground(redColor) var Gray = lipgloss.NewStyle().Foreground(grayColor) var Cyan = lipgloss.NewStyle().Foreground(cyanColor) +var BlueBackground = lipgloss.NewStyle().Background(blueColor).Foreground(blackColor).Padding(0, 1) +var LightBlueBackground = lipgloss.NewStyle().Background(lightblueColor).Foreground(blackColor).Padding(0, 1) +var LightBrownBackground = lipgloss.NewStyle().Background(lightbrownColor).Foreground(blackColor).Padding(0, 1) +var GrayBackground = lipgloss.NewStyle().Background(grayColor).Foreground(blackColor).Padding(0, 1) +var RedBackground = lipgloss.NewStyle().Background(redColor).Foreground(blackColor).Padding(0, 1) +var Spectrum = []lipgloss.Style{ + BlueBackground, + LightBlueBackground, + LightBrownBackground, + GrayBackground, + RedBackground, +} + // https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=5 var DispatcherMessageStyle = lipgloss.NewStyle().Foreground(blueColor) -var DispatcherComponentStyle = lipgloss.NewStyle().Background(blueColor).Foreground(blackColor).Padding(0, 1) +var DispatcherComponentStyle = BlueBackground var WorkersMessageStyle = lipgloss.NewStyle().Foreground(lightblueColor) -var WorkersComponentStyle = lipgloss.NewStyle().Background(lightblueColor).Foreground(blackColor).Padding(0, 1) +var WorkersComponentStyle = LightBlueBackground var WorkStealerMessageStyle = lipgloss.NewStyle().Foreground(lightbrownColor).Faint(true) -var WorkStealerComponentStyle = lipgloss.NewStyle().Background(lightbrownColor).Foreground(blackColor).Padding(0, 1) +var WorkStealerComponentStyle = LightBrownBackground var MinioComponentStyle = lipgloss.NewStyle().Background(lightyellowColor).Foreground(blackColor).Padding(0, 1) -var ClusterComponentStyle = lipgloss.NewStyle().Background(grayColor).Foreground(blackColor).Padding(0, 1) +var ClusterComponentStyle = GrayBackground var OtherComponentStyle = lipgloss.NewStyle().Padding(0, 1) -var ErrorComponentStyle = lipgloss.NewStyle().Background(redColor).Foreground(blackColor).Padding(0, 1) +var ErrorComponentStyle = RedBackground func ComponentStyle(c lunchpail.Component) lipgloss.Style { switch c { diff --git a/tests/tests/python-code-code-quality/post.sh b/tests/tests/python-code-code-quality/post.sh deleted file mode 100755 index 2fe5c82b3..000000000 --- a/tests/tests/python-code-code-quality/post.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - -# actual_sha256=$(cat "$actual" | sha256sum) -# expected_sha256=$(gunzip -c "$expected" | sha256sum) - -# if [ "$actual_sha256" = "$expected_sha256" ] -# then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" -# else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 -# fi - - rm -f "$actual" -} - -validate sample_1.parquet "$DATA"/expected/sample_1.parquet.gz -validate sample_2.parquet "$DATA"/expected/sample_2.parquet.gz diff --git a/tests/tests/python-code-code-quality/settings.sh b/tests/tests/python-code-code-quality/settings.sh deleted file mode 100644 index 41f43f879..000000000 --- a/tests/tests/python-code-code-quality/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("input table has 2 rows and 2 columns" "output table has 2 rows and 14 columns" "input table has 2 rows and 2 columns" "output table has 2 rows and 14 columns") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/sample_1.parquet.gz "$TEST_PATH"/pail/test-data/input/sample_2.parquet.gz' diff --git a/tests/tests/python-code-code2parquet/pail/.helmignore b/tests/tests/python-code-code2parquet/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-code-code2parquet/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-code-code2parquet/post.sh b/tests/tests/python-code-code2parquet/post.sh deleted file mode 120000 index 20f68cf81..000000000 --- a/tests/tests/python-code-code2parquet/post.sh +++ /dev/null @@ -1 +0,0 @@ -../python-language-html2parquet/post.sh \ No newline at end of file diff --git a/tests/tests/python-code-code2parquet/settings.sh b/tests/tests/python-code-code2parquet/settings.sh deleted file mode 100644 index f3774dc69..000000000 --- a/tests/tests/python-code-code2parquet/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Done with number_of_rows=2" "Done with number_of_rows=20" "Done with number_of_rows=52") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='"$TEST_PATH"/pail/test-data/input/application-java.zip "$TEST_PATH"/pail/test-data/input/data-processing-lib.zip "$TEST_PATH"/pail/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip' diff --git a/tests/tests/python-code-header-cleanser/pail/.helmignore b/tests/tests/python-code-header-cleanser/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-code-header-cleanser/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/expected/metadata.json b/tests/tests/python-code-header-cleanser/pail/test-data/expected/metadata.json deleted file mode 100644 index 575fe5470..000000000 --- a/tests/tests/python-code-header-cleanser/pail/test-data/expected/metadata.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "header_cleanser", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-06-16 09:05:43", - "end_time": "2024-06-16 09:05:50", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "contents_column_name": "contents", - "license": "true", - "copyright": "true", - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".parquet"] - }, - "job_output_stats": { - "source_files": 1, - "source_size": 17466, - "result_files": 1, - "result_size": 38953, - "processing_time": 7.257367134094238, - "Removed code count": 9, - "source_doc_count": 10, - "result_doc_count": 10 - }, - "source": { - "name": "/home/yash/git_fork_3/data-prep-kit/transforms/code/header_cleanser/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/home/yash/git_fork_3/data-prep-kit/transforms/code/header_cleanser/python/test-data/expected/license-and-copyright", - "type": "path" - } -} diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.output.gz b/tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.output.gz deleted file mode 100644 index 7f25b6c5e..000000000 Binary files a/tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.output.gz and /dev/null differ diff --git a/tests/tests/python-code-header-cleanser/post.sh b/tests/tests/python-code-header-cleanser/post.sh deleted file mode 100755 index 6b5db4d4a..000000000 --- a/tests/tests/python-code-header-cleanser/post.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - -# actual_sha256=$(cat "$actual" | sha256sum) -# expected_sha256=$(gunzip -c "$expected" | sha256sum) -# -# if [ "$actual_sha256" = "$expected_sha256" ] -# then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" -# else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 -# fi - - rm -f "$actual" -} - -validate test1.parquet "$DATA"/expected/test1.parquet.gz diff --git a/tests/tests/python-code-header-cleanser/settings.sh b/tests/tests/python-code-header-cleanser/settings.sh deleted file mode 100644 index 06067141e..000000000 --- a/tests/tests/python-code-header-cleanser/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("input table has 10 rows" "output table has 10 rows") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz' diff --git a/tests/tests/python-language-doc-chunk/pail/.helmignore b/tests/tests/python-language-doc-chunk/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-doc-chunk/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-doc-chunk/post.sh b/tests/tests/python-language-doc-chunk/post.sh deleted file mode 100755 index 2e63ddf44..000000000 --- a/tests/tests/python-language-doc-chunk/post.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -DATA="$TEST_PATH"/pail/test-data - -for i in $(seq 1 1) -do - actual=test${i}_0.parquet - expected="$DATA"/expected/test${i}_0.parquet.gz - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -done diff --git a/tests/tests/python-language-doc-chunk/settings.sh b/tests/tests/python-language-doc-chunk/settings.sh deleted file mode 100644 index d726ccee7..000000000 --- a/tests/tests/python-language-doc-chunk/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Transforming one table with 1 rows" "Done with nfiles=1 nrows=88") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz' diff --git a/tests/tests/python-language-doc-quality/pail/.helmignore b/tests/tests/python-language-doc-quality/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-doc-quality/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-doc-quality/post.sh b/tests/tests/python-language-doc-quality/post.sh deleted file mode 100755 index 0ec75b1b4..000000000 --- a/tests/tests/python-language-doc-quality/post.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -DATA="$TEST_PATH"/pail/test-data - -for i in $(seq 1 1) -do - actual=test$i.parquet - expected="$DATA"/expected/test$i.parquet.gz - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -done diff --git a/tests/tests/python-language-doc-quality/settings.sh b/tests/tests/python-language-doc-quality/settings.sh deleted file mode 100644 index b84168f46..000000000 --- a/tests/tests/python-language-doc-quality/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Load badwords found locally" "Done. Writing output to") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz' diff --git a/tests/tests/python-language-html2parquet/pail/.helmignore b/tests/tests/python-language-html2parquet/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-html2parquet/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-html2parquet/post.sh b/tests/tests/python-language-html2parquet/post.sh deleted file mode 100755 index d7a16e203..000000000 --- a/tests/tests/python-language-html2parquet/post.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -for i in "$DATA"/input/* -do - b=$(basename $i) - if [[ "$b" =~ "output" ]] - then continue - fi - - ext=${b##*.} - bb=${b%.*} - actual="$(dirname $i)"/"$bb".output.$ext - expected="$DATA"/expected/$bb.parquet.gz - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual expected=$expected actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -done diff --git a/tests/tests/python-language-html2parquet/settings.sh b/tests/tests/python-language-html2parquet/settings.sh deleted file mode 100644 index ad3528fb9..000000000 --- a/tests/tests/python-language-html2parquet/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Done with nrows=1" "Done with nrows=2") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='"$TEST_PATH"/pail/test-data/input/test1.html "$TEST_PATH"/pail/test-data/input/html_zip.zip' diff --git a/tests/tests/python-language-lang-id/pail/.helmignore b/tests/tests/python-language-lang-id/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-lang-id/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-lang-id/post.sh b/tests/tests/python-language-lang-id/post.sh deleted file mode 100755 index 90675773c..000000000 --- a/tests/tests/python-language-lang-id/post.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -DATA="$TEST_PATH"/pail/test-data/sm - -for i in $(seq 1 3) -do - actual=test_0$i.parquet - expected="$DATA"/expected/test_0$i.parquet.gz - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -done diff --git a/tests/tests/python-language-lang-id/settings.sh b/tests/tests/python-language-lang-id/settings.sh deleted file mode 100644 index 49cb4a58b..000000000 --- a/tests/tests/python-language-lang-id/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Transforming one table" "Done. Writing output to") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/sm/input/test_01.parquet.gz "$TEST_PATH"/pail/test-data/sm/input/test_02.parquet.gz "$TEST_PATH"/pail/test-data/sm/input/test_03.parquet.gz' diff --git a/tests/tests/python-language-pdf2parquet/pail/.helmignore b/tests/tests/python-language-pdf2parquet/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-pdf2parquet/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-pdf2parquet/post.sh b/tests/tests/python-language-pdf2parquet/post.sh deleted file mode 100755 index 004b05cf3..000000000 --- a/tests/tests/python-language-pdf2parquet/post.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -for i in "$DATA"/input/* -do - b=$(basename $i) - if [[ "$b" =~ "output" ]] - then continue - fi - - ext=${b##*.} - bb=${b%%.*} - actual="$(dirname $i)"/"$bb".output.$ext - expected="$DATA"/expected/$bb.parquet.gz - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual expected=$expected actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -done diff --git a/tests/tests/python-language-pdf2parquet/settings.sh b/tests/tests/python-language-pdf2parquet/settings.sh deleted file mode 100644 index 6aa408f8c..000000000 --- a/tests/tests/python-language-pdf2parquet/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Done with nrows=1 nsuccess=1 nfail=0 nskip=0" "Done with nrows=2 nsuccess=2 nfail=0 nskip=0") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='"$TEST_PATH"/pail/test-data/input/redp5110-ch1.pdf "$TEST_PATH"/pail/test-data/input/archive1.zip' diff --git a/tests/tests/python-language-pii-redactor/pail/.helmignore b/tests/tests/python-language-pii-redactor/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-pii-redactor/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-pii-redactor/pail/test-data/sm/pii_test_data.expected.parquet.gz b/tests/tests/python-language-pii-redactor/pail/test-data/sm/pii_test_data.expected.parquet.gz deleted file mode 100644 index 6bff449a1..000000000 Binary files a/tests/tests/python-language-pii-redactor/pail/test-data/sm/pii_test_data.expected.parquet.gz and /dev/null differ diff --git a/tests/tests/python-language-pii-redactor/post.sh b/tests/tests/python-language-pii-redactor/post.sh deleted file mode 100755 index d2381da89..000000000 --- a/tests/tests/python-language-pii-redactor/post.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh - -DATA="$TEST_PATH"/pail/test-data - -actual=1.parquet -expected="$DATA"/xs/1.expected.parquet.gz - -while true -do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi -done - -actual_sha256=$(cat "$actual" | sha256sum) -expected_sha256=$(gunzip -c "$expected" | sha256sum) - -if [ "$actual_sha256" = "$expected_sha256" ] -then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" -else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 -fi - -rm -f "$actual" diff --git a/tests/tests/python-language-pii-redactor/settings.sh b/tests/tests/python-language-pii-redactor/settings.sh deleted file mode 100644 index e5fc448a1..000000000 --- a/tests/tests/python-language-pii-redactor/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("SequenceTagger predicts" "Done. Writing output to") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/xs/1.parquet.gz' diff --git a/tests/tests/python-language-text-encoder/pail/.helmignore b/tests/tests/python-language-text-encoder/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-language-text-encoder/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-language-text-encoder/post.sh b/tests/tests/python-language-text-encoder/post.sh deleted file mode 100755 index f5bc30fcc..000000000 --- a/tests/tests/python-language-text-encoder/post.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -for i in "$DATA"/input/* -do - b=$(basename $i) - if [[ "$b" =~ "output" ]] - then continue - fi - - ext=${b##*.} - bb=${b%%.*} - actual=$bb.parquet - expected="$DATA"/expected/$bb.parquet.gz - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - # ugh, we cannot currently compare the output contents due to an upstream bug - # https://github.com/IBM/data-prep-kit/issues/483 - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL (but ignoring for now) mismatched sha256 on output file file=$actual expected=$expected actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" # && exit 1 - fi - - rm -f "$actual" -done diff --git a/tests/tests/python-language-text-encoder/settings.sh b/tests/tests/python-language-text-encoder/settings.sh deleted file mode 100644 index 9b68716d7..000000000 --- a/tests/tests/python-language-text-encoder/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Done with nfiles=1 nrows=2") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz' diff --git a/tests/tests/python-universal-doc-id/pail/.helmignore b/tests/tests/python-universal-doc-id/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-universal-doc-id/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-universal-doc-id/post.sh b/tests/tests/python-universal-doc-id/post.sh deleted file mode 100755 index a1ea4e576..000000000 --- a/tests/tests/python-universal-doc-id/post.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - -# actual_sha256=$(cat "$actual" | sha256sum) -# expected_sha256=$(gunzip -c "$expected" | sha256sum) -# -# if [ "$actual_sha256" = "$expected_sha256" ] -# then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" -# else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 -# fi - - rm -f "$actual" -} - -validate sample1.parquet "$DATA"/expected/sample1.parquet.gz diff --git a/tests/tests/python-universal-doc-id/settings.sh b/tests/tests/python-universal-doc-id/settings.sh deleted file mode 100644 index c937fdbf4..000000000 --- a/tests/tests/python-universal-doc-id/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("input table has 5 rows" "output table has 5 rows") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/sample1.parquet.gz' diff --git a/tests/tests/python-universal-ededup/pail/.helmignore b/tests/tests/python-universal-ededup/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-universal-ededup/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-universal-ededup/post.sh b/tests/tests/python-universal-ededup/post.sh deleted file mode 100755 index 90b113adc..000000000 --- a/tests/tests/python-universal-ededup/post.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - -# actual_sha256=$(cat "$actual" | sha256sum) -# expected_sha256=$(gunzip -c "$expected" | sha256sum) - -# if [ "$actual_sha256" = "$expected_sha256" ] -# then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" -# else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 -# fi - - rm -f "$actual" -} - -validate sample1.parquet "$DATA"/expected/sample1.parquet.gz diff --git a/tests/tests/python-universal-ededup/settings.sh b/tests/tests/python-universal-ededup/settings.sh deleted file mode 100644 index e3dc3f5aa..000000000 --- a/tests/tests/python-universal-ededup/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("input table has 5 rows and 38 columns" "output table has 3 rows and 39 columns" "output table has 3 rows and 39 columns") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/sample1.parquet.gz' diff --git a/tests/tests/python-universal-filter/pail/.helmignore b/tests/tests/python-universal-filter/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-universal-filter/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-universal-filter/post.sh b/tests/tests/python-universal-filter/post.sh deleted file mode 100755 index 8f6f84762..000000000 --- a/tests/tests/python-universal-filter/post.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -} - -validate test1.parquet "$DATA"/expected/test1.parquet.gz diff --git a/tests/tests/python-universal-filter/settings.sh b/tests/tests/python-universal-filter/settings.sh deleted file mode 100644 index fcb216b0b..000000000 --- a/tests/tests/python-universal-filter/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Done with docs_after_filter=100 columns_after_filter=25 bytes_after_filter=478602") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='--gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz' diff --git a/tests/tests/python-universal-resize/pail/.helmignore b/tests/tests/python-universal-resize/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-universal-resize/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-universal-resize/post.sh b/tests/tests/python-universal-resize/post.sh deleted file mode 100755 index 9dc99867b..000000000 --- a/tests/tests/python-universal-resize/post.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - if [ ! -e "$expected" ] - then echo "❌ FAIL cannot find expected output file $expected test=$TEST_NAME" && exit 1 - fi - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -} - -validate test1_0.parquet "$DATA"/expected/test1_0.parquet.gz -validate test1_1.parquet "$DATA"/expected/test1_1.parquet.gz -validate test2_0.parquet "$DATA"/expected/test2_0.parquet.gz -validate test2_1.parquet "$DATA"/expected/test2_1.parquet.gz -validate test3_0.parquet "$DATA"/expected/test3_0.parquet.gz -validate test3_1.parquet "$DATA"/expected/test3_1.parquet.gz diff --git a/tests/tests/python-universal-resize/settings.sh b/tests/tests/python-universal-resize/settings.sh deleted file mode 100644 index 175645e4c..000000000 --- a/tests/tests/python-universal-resize/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("max bytes = 0" "max rows = 125" "got new table with 200 rows" "flushing buffered table with 75 rows of size 82627") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='-e max_rows_per_table=125 --gunzip "$TEST_PATH"/pail/test-data/input/test1.parquet.gz "$TEST_PATH"/pail/test-data/input/test2.parquet.gz "$TEST_PATH"/pail/test-data/input/test3.parquet.gz' diff --git a/tests/tests/python-universal-tokenization/pail/.helmignore b/tests/tests/python-universal-tokenization/pail/.helmignore deleted file mode 100644 index 6350f0fe2..000000000 --- a/tests/tests/python-universal-tokenization/pail/.helmignore +++ /dev/null @@ -1 +0,0 @@ -test-data/ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet.gz deleted file mode 100644 index 2651457b3..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/metadata.json b/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/metadata.json deleted file mode 100644 index e6c190807..000000000 --- a/tests/tests/python-universal-tokenization/pail/test-data/ds01/expected/metadata.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "Tokenization", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-29 13:30:56", - "end_time": "2024-03-29 13:30:57", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "tokenizer": "hf-internal-testing/llama-tokenizer", - "tokenizer_args": null, - "doc_id_column": "document_id", - "doc_content_column": "contents", - "text_lang": "en", - "chunk_size": 0, - "checkpointing": false, - "max_files": -1, - "number of workers": 5, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 27.31659088190645, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 5, - "source_size": 450, - "result_files": 3, - "result_size": 842, - "table_processing": 0.03880786895751953, - "num_files": 3, - "num_rows": 6, - "num_tokenized_rows": 6, - "num_tokens": 85, - "num_chars": 384, - "skipped empty tables": 2 - }, - "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds01/input", - "type": "path" - }, - "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds01", - "type": "path" - } -} diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet.gz deleted file mode 100644 index 78089a60e..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet.gz deleted file mode 100644 index 08198e964..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet.gz deleted file mode 100644 index 7c22d09a0..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq01.output.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq01.output.parquet.gz deleted file mode 100644 index 8fe5b1f56..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds01/input/lang=en/pq01.output.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds02/expected/df_17m.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds02/expected/df_17m.parquet.gz deleted file mode 100644 index 5b5a0d51c..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds02/expected/df_17m.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds02/expected/metadata.json b/tests/tests/python-universal-tokenization/pail/test-data/ds02/expected/metadata.json deleted file mode 100644 index dc9813beb..000000000 --- a/tests/tests/python-universal-tokenization/pail/test-data/ds02/expected/metadata.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "Tokenization", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-29 14:03:15", - "end_time": "2024-03-29 14:03:32", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "tokenizer": "hf-internal-testing/llama-tokenizer", - "tokenizer_args": null, - "doc_id_column": "document_id", - "doc_content_column": "contents", - "text_lang": "en", - "chunk_size": 20000, - "checkpointing": false, - "max_files": -1, - "number of workers": 5, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 27.180484008975327, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 16863266, - "result_files": 1, - "result_size": 37109764, - "table_processing": 15.886597871780396, - "num_files": 1, - "num_rows": 1, - "num_tokenized_rows": 1, - "num_tokens": 4638717, - "num_chars": 16836009 - }, - "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds02/input", - "type": "path" - }, - "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds02", - "type": "path" - } -} diff --git a/tests/tests/python-universal-tokenization/pail/test-data/ds02/input/df_17m.parquet.gz b/tests/tests/python-universal-tokenization/pail/test-data/ds02/input/df_17m.parquet.gz deleted file mode 100644 index d5acbcbad..000000000 Binary files a/tests/tests/python-universal-tokenization/pail/test-data/ds02/input/df_17m.parquet.gz and /dev/null differ diff --git a/tests/tests/python-universal-tokenization/post.sh b/tests/tests/python-universal-tokenization/post.sh deleted file mode 100755 index d00f62d3b..000000000 --- a/tests/tests/python-universal-tokenization/post.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -DATA="$TEST_PATH"/pail/test-data/ds01 - -function validate { - actual="$1" - expected="$2" - - while true - do - if [ -f $actual ] - then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break - else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1 - fi - done - - actual_sha256=$(cat "$actual" | sha256sum) - expected_sha256=$(gunzip -c "$expected" | sha256sum) - - if [ "$actual_sha256" = "$expected_sha256" ] - then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME" - else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1 - fi - - rm -f "$actual" -} - -validate task.1.txt "$DATA"/expected/lang=en/pq01.parquet.gz -validate task.2.txt "$DATA"/expected/lang=en/pq02.parquet.gz diff --git a/tests/tests/python-universal-tokenization/settings.sh b/tests/tests/python-universal-tokenization/settings.sh deleted file mode 100644 index 7ed44be5c..000000000 --- a/tests/tests/python-universal-tokenization/settings.sh +++ /dev/null @@ -1,9 +0,0 @@ -api=workqueue - -expected=("Done with num_files=1 num_rows=3 num_tokenized_rows=3 num_empty_rows=0 num_tokens=45 num_chars=193" "Done with num_files=1 num_rows=2 num_tokenized_rows=2 num_empty_rows=0 num_tokens=28 num_chars=132") -NUM_DESIRED_OUTPUTS=0 - -# the default is --yaml. we don't want that -source_from=" " - -up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/ds01/input/lang=en/pq01.parquet.gz) <(gunzip -c "$TEST_PATH"/pail/test-data/ds01/input/lang=en/pq02.parquet.gz)'