diff --git a/examples/demo_with_launcher.ipynb b/examples/demo_with_launcher.ipynb index cf7c67219..81563638d 100644 --- a/examples/demo_with_launcher.ipynb +++ b/examples/demo_with_launcher.ipynb @@ -64,7 +64,7 @@ "import os\n", "import sys\n", "\n", - "from data_processing.runtime.ray import RayTransformLauncher\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", "from data_processing.utils import ParamsUtils" ] }, @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "60ac8bee-0960-4309-b225-d7a211b14262", "metadata": {}, "outputs": [], @@ -87,7 +87,6 @@ "# We can set input paths here\n", "zip_input_folder = \"input_data\"\n", "\n", - "\n", "if not os.path.exists(zip_input_folder):\n", " print (\"NO INPUT DATA\")\n", " print (\"Please set `zip_input_folder` variable to path containing data\")\n", @@ -134,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "id": "482605b2-d814-456d-9195-49a2ec454ef0", "metadata": {}, "outputs": [], @@ -155,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", "metadata": {}, "outputs": [ @@ -163,9 +162,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "15:47:27 INFO - data factory data_ is using local data access: input_folder - input_data output_folder - test-data/parquet_input\n", - "15:47:27 INFO - data factory data_ max_files -1, n_sample -1\n", - "15:47:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.zip']\n" + "23:31:45 INFO - data factory data_ is using local data access: input_folder - input_data output_folder - test-data/parquet_input\n", + "23:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.zip'], files to checkpoint ['.parquet']\n" ] }, { @@ -173,92 +172,24 @@ "output_type": "stream", "text": [ "Number of files is 1 \n", - "filepath /Users/himapatel/Work/Projects/MCD/OpenSource/Demo/data-prep-kit/tools/ingest2parquet/src/utils/lang_extensions.json\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xf8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: 
invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xf8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xac in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x8c in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xfd in position 10: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb5 in position 10: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xc4 in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xd8 in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xc4 in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xd8 in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb0 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb0 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x98 in position 77: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x98 in position 77: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xe2 in position 114: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xe2 in position 114: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec 
can't decode byte 0x80 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x88 in position 119: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xe0 in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xe0 in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xdc in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xdc in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xca in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xca in position 7: invalid continuation byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xa0 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xa0 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xf8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xf8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xf8 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb0 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xb0 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x80 in position 7: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x8a in position 70: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xa0 in position 81: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xa0 in position 81: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xa8 in position 85: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0x98 in position 77: invalid start byte\n", - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xc0 in position 7: invalid start byte\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "filepath /Users/shivdeep/workspace/projects/current/oss-data-prep/test-ingest2parquet/data-prep-kit/tools/ingest2parquet/src/utils/lang_extensions.json\n", + " skipping data-prep-kit-dev/data-processing-lib/doc/foundation-classes.jpg No contents decoded\n", " skipping data-prep-kit-dev/data-processing-lib/doc/processing-architecture.jpg No contents decoded\n", - " skipping data-prep-kit-dev/data-processing-lib/ray/src/data_processing/__init__.py No contents decoded\n", - " skipping data-prep-kit-dev/data-processing-lib/ray/src/data_processing/test_support/launch/__init__.py No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/src/data_processing/__init__.py No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/src/data_processing/test_support/launch/__init__.py No contents decoded\n", + " skipping 
data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/daf/input/ds1/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/daf/input/ds1/sample2.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/daf/input/ds2/sample3.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/daf/output/ds1/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/input/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/input_multiple/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/input_multiple/sample2.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/input_multiple/sample3.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/python/noop/expected/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/python/noop/expected/subdir/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/python/noop/input/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/python/test-data/data_processing/python/noop/input/subdir/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/ray/src/data_processing_ray/test_support/__init__.py No contents decoded\n", " skipping data-prep-kit-dev/data-processing-lib/ray/test-data/data_processing/daf/input/ds1/sample1.parquet No contents decoded\n", " skipping data-prep-kit-dev/data-processing-lib/ray/test-data/data_processing/daf/input/ds1/sample2.parquet No contents decoded\n", " skipping data-prep-kit-dev/data-processing-lib/ray/test-data/data_processing/daf/input/ds2/sample3.parquet No contents decoded\n", @@ -271,6 +202,10 @@ " skipping data-prep-kit-dev/data-processing-lib/ray/test-data/data_processing/ray/noop/expected/subdir/test1.parquet No contents decoded\n", " skipping data-prep-kit-dev/data-processing-lib/ray/test-data/data_processing/ray/noop/input/sample1.parquet No contents decoded\n", " skipping data-prep-kit-dev/data-processing-lib/ray/test-data/data_processing/ray/noop/input/subdir/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/spark/test-data/data_processing/spark/noop/expected/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/spark/test-data/data_processing/spark/noop/expected/subdir/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/spark/test-data/data_processing/spark/noop/input/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/data-processing-lib/spark/test-data/data_processing/spark/noop/input/subdir/test1.parquet No contents decoded\n", " skipping data-prep-kit-dev/doc/data-flow.jpg No contents decoded\n", " skipping data-prep-kit-dev/doc/data-transformation.jpg No contents decoded\n", " skipping data-prep-kit-dev/examples/requirements.txt No contents decoded\n", @@ -282,64 +217,120 @@ " skipping data-prep-kit-dev/kfp/doc/param_list1.png No contents decoded\n", " skipping 
data-prep-kit-dev/kfp/doc/param_list2.png No contents decoded\n", " skipping data-prep-kit-dev/kfp/doc/podman_vm_settings.png No contents decoded\n", + " skipping data-prep-kit-dev/kfp/doc/super-code-pipeline.png No contents decoded\n", " skipping data-prep-kit-dev/kfp/doc/super_pipeline.png No contents decoded\n", + " skipping data-prep-kit-dev/kfp/kfp_support_lib/kfp_v2_workflow_support/__init__.py No contents decoded\n", " skipping data-prep-kit-dev/tools/ingest2parquet/test-data/expected/application-java.parquet No contents decoded\n", " skipping data-prep-kit-dev/tools/ingest2parquet/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet No contents decoded\n", " skipping data-prep-kit-dev/tools/ingest2parquet/test-data/input/application-java.zip No contents decoded\n", " skipping data-prep-kit-dev/tools/ingest2parquet/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/code_quality/test-data/expected/sample_1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/code_quality/test-data/expected/sample_2.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/code_quality/test-data/input/sample_1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/code_quality/test-data/input/sample_2.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/malware/test-data/expected/sample.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/malware/test-data/input/sample.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/proglang_select/test-data/expected/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/code/proglang_select/test-data/input/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/doc_id/test-data/expected/sample1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/doc_id/test-data/input/sample1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/ededup/images/exactdedup.png No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/ededup/test-data/expected/sample1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/ededup/test-data/input/sample1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/images/fuzzy.png No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/test-data/expected/sample1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/test-data/expected/snapshot/buckets/buckets_collector_0 No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/test-data/expected/snapshot/docs/doc_collector_0 No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/test-data/expected/snapshot/docs/doc_collector_1 No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/test-data/expected/snapshot/minhash/minhash_collector_0 No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/fdedup/test-data/input/sample1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-and-local/test1.parquet No contents decoded\n", - " skipping 
data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-and/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-datetime-like-local/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-datetime-like/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-default-local/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-default/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-in-local/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-in/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-or-local/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/expected/test-or/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/filter/test-data/input/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/noop/test-data/expected/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/noop/test-data/input/test1.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet No contents decoded\n", - " skipping data-prep-kit-dev/transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet No contents decoded\n", - "processing stats generated {'total_files_given': 1, 'total_files_processed': 1, 'total_files_failed_to_processed': 0, 'total_no_of_rows': 361, 'total_bytes_in_memory': 1355605, 'failure_details': []}\n", + " skipping data-prep-kit-dev/transforms/code/code_quality/ray/test-data/expected/sample_1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/code_quality/ray/test-data/expected/sample_2.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/code_quality/ray/test-data/input/sample_1.parquet No contents decoded\n", + " skipping 
data-prep-kit-dev/transforms/code/code_quality/ray/test-data/input/sample_2.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/ingest_2_parquet/ray/test-data/expected/application-java.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/ingest_2_parquet/ray/test-data/expected/data-processing-lib.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/ingest_2_parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/ingest_2_parquet/ray/test-data/input/application-java.zip No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/ingest_2_parquet/ray/test-data/input/data-processing-lib.zip No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/ingest_2_parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/malware/ray/test-data/expected/sample.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/malware/ray/test-data/input/sample.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/proglang_select/ray/test-data/expected/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/code/proglang_select/ray/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/python/test-data/expected/test_01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/python/test-data/expected/test_02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/python/test-data/expected/test_03.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/python/test-data/input/test_01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/python/test-data/input/test_02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/python/test-data/input/test_03.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/ray/test-data/expected/test_01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/ray/test-data/expected/test_02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/ray/test-data/expected/test_03.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/ray/test-data/input/test_01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/ray/test-data/input/test_02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/language/lang_id/ray/test-data/input/test_03.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/doc_id/ray/test-data/expected/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/doc_id/ray/test-data/input/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/doc_id/spark/test-data/expected/_SUCCESS No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/doc_id/spark/test-data/expected/part-00000-3a598f8d-7760-4d8d-8b42-f9bc657031d1-c000.snappy.parquet No contents decoded\n", + " skipping 
data-prep-kit-dev/transforms/universal/doc_id/spark/test-data/expected/part-00001-3a598f8d-7760-4d8d-8b42-f9bc657031d1-c000.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/doc_id/spark/test-data/input/test_doc_id_1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/doc_id/spark/test-data/input/test_doc_id_2.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/ededup/ray/images/exactdedup.png No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/ededup/ray/test-data/expected/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/ededup/ray/test-data/input/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/images/fuzzy.png No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_1 No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/fdedup/ray/test-data/input/sample1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-and-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-and/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-datetime-like-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-datetime-like/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-default-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-default/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-in-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-in/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-or-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/expected/test-or/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/python/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-and-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-and/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-datetime-like-local/test1.parquet No 
contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-datetime-like/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-default-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-default/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-in-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-in/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-or-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/expected/test-or/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/ray/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-and-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-and/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-datetime-like-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-datetime-like/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-default-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-default/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-in-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-in/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-or-local/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/expected/test-or/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/filter/spark/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/python/test-data/expected/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/python/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/ray/test-data/expected/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/ray/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/spark/test-data/expected/_SUCCESS No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/spark/test-data/expected/part-00000-1054f038-14f3-4885-a4db-cd6b03176712-c000.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/noop/spark/test-data/input/test1.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/profiler/ray/test-data/input/sample1.parquet No contents decoded\n", 
+ " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet No contents decoded\n", + " skipping data-prep-kit-dev/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet No contents decoded\n", + "processing stats generated {'total_files_given': 1, 'total_files_processed': 1, 'total_files_failed_to_processed': 0, 'total_no_of_rows': 589, 'total_bytes_in_memory': 2019261, 'failure_details': []}\n", "Metadata file stored - response: {'name': 'test-data/parquet_input/metadata.json', 'size': 327}\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "15:47:28 ERROR - Error -> 'utf-8' codec can't decode byte 0xa7 in position 21: invalid start byte\n" - ] } ], "source": [ @@ -392,7 +383,7 @@ }, 
{ "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "id": "58a0e1f6-ff53-40aa-96b1-096ade4bd1c0", "metadata": {}, "outputs": [ @@ -425,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", "metadata": { "scrolled": true @@ -435,25 +426,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "15:48:33 INFO - Running locally\n", - "15:48:33 INFO - exact dedup params are {'hash_cpu': 0.5, 'num_hashes': 2, 'doc_column': 'contents'}\n", - "15:48:33 INFO - data factory data_ is using local data access: input_folder - test-data/parquet_input output_folder - test-data/ededup_out\n", - "15:48:33 INFO - data factory data_ max_files -1, n_sample -1\n", - "15:48:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n", - "15:48:33 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", - "15:48:33 INFO - pipeline id pipeline_id; number workers 3\n", - "15:48:33 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "15:48:33 INFO - code location None\n", - "15:48:33 INFO - actor creation delay 0\n", - "2024-05-14 15:48:37,185\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:39 INFO - orchestrator started at 2024-05-14 15:48:39\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:39 INFO - Number of files is 1, source profile {'max_file_size': 0.4422798156738281, 'min_file_size': 0.4422798156738281, 'total_file_size': 0.4422798156738281}\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:39 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 39.97669982910156, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:39 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:39 INFO - Completed 0 files in 7.299582163492839e-06 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:41 INFO - Completed processing in 0.034180982907613115 min\n", - "\u001b[36m(orchestrate pid=53568)\u001b[0m 15:48:41 INFO - done flushing in 0.002724170684814453 sec\n", - "15:48:51 INFO - Completed execution in 0.313483452796936 min, execution result 0\n" + "23:31:45 INFO - Running locally\n", + "23:31:45 INFO - exact dedup params are {'hash_cpu': 0.5, 'num_hashes': 2, 'doc_column': 'contents'}\n", + "23:31:45 INFO - data factory data_ is using local data access: input_folder - test-data/parquet_input output_folder - test-data/ededup_out\n", + "23:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:31:45 INFO - pipeline id pipeline_id\n", + "23:31:45 INFO - code location None\n", + "23:31:45 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", + "23:31:45 INFO - actor creation delay 0\n", + "23:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:31:47,958\tINFO worker.py:1715 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:48 INFO - orchestrator started at 2024-06-19 23:31:48\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:48 INFO - Number of files is 5, source profile {'max_file_size': 7.679162979125977, 'min_file_size': 0.010193824768066406, 'total_file_size': 8.804475784301758}\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 30.716969300061464, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:48 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:49 INFO - Completed 1 files in 0.011938178539276123 min\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:49 INFO - Completed 2 files in 0.011993948618570964 min\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:49 INFO - Completed 2 files (40.0%) in 0.011996277173360189 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:49 INFO - Completed processing 5 files in 0.01667089859644572 min\n", + "\u001b[36m(orchestrate pid=52070)\u001b[0m 23:31:49 INFO - done flushing in 0.0008389949798583984 sec\n", + "23:31:59 INFO - Completed execution in 0.2331868847211202 min, execution result 0\n" ] }, { @@ -462,14 +455,14 @@ "0" ] }, - "execution_count": 20, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Import ededup transform configuration\n", - "from ededup_transform import EdedupRayTransformConfiguration\n", + "from ededup_transform_ray import EdedupRayTransformConfiguration\n", "\n", "\n", "# Prepare the commandline params\n", @@ -495,10 +488,9 @@ "# Pass the commandline params\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", - "# launch\n", - "\n", "# create launcher\n", "ededup_launcher = RayTransformLauncher(EdedupRayTransformConfiguration())\n", + "# launch\n", "ededup_launcher.launch()" ] }, @@ -517,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 7, "id": "e6f62394-fbde-495c-bbbb-83161b006bed", "metadata": {}, "outputs": [ @@ -543,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "id": "a6daf36d-686c-4e0a-aabf-ce55f999bb2d", "metadata": {}, "outputs": [ @@ -551,25 +543,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "15:49:54 INFO - Running locally\n", - "15:49:54 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column'}\n", - "15:49:54 INFO - data factory data_ is using local data access: input_folder - test-data/ededup_out output_folder - test-data/doc_id_out\n", - "15:49:54 INFO - data factory data_ max_files -1, n_sample -1\n", - "15:49:54 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n", - "15:49:54 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", - "15:49:54 INFO - pipeline id pipeline_id; number workers 3\n", - "15:49:54 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "15:49:54 INFO - code location None\n", - "15:49:54 INFO - actor creation delay 0\n", - "2024-05-14 15:49:59,504\tINFO worker.py:1715 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:02 INFO - orchestrator started at 2024-05-14 15:50:02\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:02 INFO - Number of files is 1, source profile {'max_file_size': 0.4407463073730469, 'min_file_size': 0.4407463073730469, 'total_file_size': 0.4407463073730469}\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:02 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 39.2768947603181, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:02 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:02 INFO - Completed 0 files in 9.282430013020833e-06 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:04 INFO - Completed processing in 0.038767898082733156 min\n", - "\u001b[36m(orchestrate pid=53696)\u001b[0m 15:50:04 INFO - done flushing in 0.003125905990600586 sec\n", - "15:50:14 INFO - Completed execution in 0.3281435489654541 min, execution result 0\n" + "23:32:01 INFO - Running locally\n", + "23:32:01 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column'}\n", + "23:32:01 INFO - data factory data_ is using local data access: input_folder - test-data/ededup_out output_folder - test-data/doc_id_out\n", + "23:32:01 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:32:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:32:01 INFO - pipeline id pipeline_id\n", + "23:32:01 INFO - code location None\n", + "23:32:01 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", + "23:32:01 INFO - actor creation delay 0\n", + "23:32:01 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:32:03,187\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:03 INFO - orchestrator started at 2024-06-19 23:32:03\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:03 INFO - Number of files is 5, source profile {'max_file_size': 7.648163795471191, 'min_file_size': 0.004547119140625, 'total_file_size': 8.755243301391602}\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 31.14116516150534, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:03 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:04 INFO - Completed 1 files in 0.012701082229614257 min\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:04 INFO - Completed 2 files in 0.012730999787648519 min\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:04 INFO - Completed 2 files (40.0%) in 0.012731997172037761 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:04 INFO - Completed processing 5 files in 0.013421499729156494 min\n", + "\u001b[36m(orchestrate pid=52123)\u001b[0m 23:32:04 INFO - done flushing in 0.0007081031799316406 sec\n", + "23:32:14 INFO - Completed execution in 0.22250189781188964 min, execution result 0\n" ] }, { @@ -578,13 +572,13 @@ "0" ] }, - "execution_count": 23, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from doc_id_transform import DocIDRayTransformConfiguration\n", + "from doc_id_transform_ray import DocIDRayTransformConfiguration\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", @@ -608,7 +602,7 @@ "# launch\n", "\n", "launcher = RayTransformLauncher(DocIDRayTransformConfiguration())\n", - "launcher.launch()\n" + "launcher.launch()" ] }, { @@ -635,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 9, "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", "metadata": {}, "outputs": [ @@ -668,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 10, "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", "metadata": {}, "outputs": [ @@ -676,48 +670,53 @@ "name": "stderr", "output_type": "stream", "text": [ - "15:51:26 INFO - Running locally\n", - "15:51:26 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'int_id_column', 'cluster_column': 'hash_column', 'bucket_cpu': 0.5, 'mhash_cpu': 0.5, 'doc_cpu': 0.5, 'num_doc_actors': 2, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 2, 'num_permutations': 64, 'threshold': 0.8, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", - "15:51:26 INFO - data factory data_ is using local data access: input_folder - test-data/doc_id_out output_folder - test-data/fdedup_out\n", - "15:51:26 INFO - data factory data_ max_files -1, n_sample -1\n", - "15:51:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n", - "15:51:26 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", - "15:51:26 INFO - pipeline id pipeline_id; number workers 3\n", - "15:51:26 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "15:51:26 INFO - code location None\n", - "15:51:26 INFO - actor creation delay 0\n", - "2024-05-14 15:51:30,513\tINFO worker.py:1715 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - orchestrator started at 2024-05-14 15:51:33\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - Number of files is 1, source profile {'max_file_size': 0.4658203125, 'min_file_size': 0.4658203125, 'total_file_size': 0.4658203125}\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 39.44537696894258, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - Fuzzy: num buckets 5, bucket length 11\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - Table preprocessing uses 1 readers\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - created 1 table processor actors\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:33 INFO - Completed 0 files in 9.632110595703125e-06 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:44 INFO - Completed processing in 0.17843671639760336 min\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:44 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:45 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:45 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:46 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:46 INFO - created 2 document actors\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:46 INFO - created 1 bucket processor actors\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:46 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:46 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=53787)\u001b[0m 15:51:46 INFO - processing buckets 0 long, 1695 short\n", - "\u001b[36m(BucketsHash pid=53787)\u001b[0m 15:51:46 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:48 INFO - Done processing buckets in 0.027178847789764406 min\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:48 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=53795)\u001b[0m 15:51:48 INFO - Waiting bucket processing completion\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:50 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:50 INFO - Completed 0 files in 8.519490559895834e-06 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:58 INFO - Completed processing in 0.13367428382237753 min\n", - "\u001b[36m(orchestrate pid=53772)\u001b[0m 15:51:58 INFO - done flushing in 0.0031740665435791016 sec\n", - "15:52:08 INFO - Completed execution in 0.6925825993220012 min, execution result 0\n" + "23:32:16 INFO - Running locally\n", + "23:32:16 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'int_id_column', 'cluster_column': 'hash_column', 'bucket_cpu': 0.5, 'mhash_cpu': 0.5, 'doc_cpu': 0.5, 'num_doc_actors': 2, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 2, 'num_permutations': 64, 'threshold': 0.8, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", + "23:32:16 INFO - data factory data_ is using local data access: input_folder - test-data/doc_id_out output_folder - test-data/fdedup_out\n", + "23:32:16 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:32:16 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:32:16 INFO - pipeline id pipeline_id\n", + "23:32:16 INFO - code location None\n", + "23:32:16 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", + "23:32:16 INFO - actor creation delay 0\n", + "23:32:16 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:32:18,484\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - orchestrator started at 2024-06-19 23:32:19\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - Number of files is 5, source profile {'max_file_size': 7.7561540603637695, 'min_file_size': 0.005639076232910156, 'total_file_size': 8.945060729980469}\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 31.387203980237246, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - starting run from the beginning\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - continuing from the very beginning\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - Fuzzy: num buckets 5, bucket length 11\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - created 1 bucket actors\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - created 1 minhash actors\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - Table preprocessing uses 2 readers\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:19 INFO - created 2 table processor actors\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:25 INFO - Completed 1 files in 0.09585129817326864 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:26 INFO - Completed 2 files in 0.11462793350219727 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:29 INFO - Completed 3 files in 0.1648825526237488 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:29 INFO - Completed 3 files (60.0%) in 0.16489086548487344 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:35 INFO - Completed processing 5 files in 0.2676188151041667 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:35 INFO - creating minhash snapshots\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:36 INFO - minhash snapshots created\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:36 INFO - creating bucket snapshots\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:37 INFO - bucket snapshots created\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:37 INFO - created 2 document actors\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:37 INFO - created 2 bucket processor actors\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:37 INFO - created bucket processor invoker\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:37 INFO - added invoker to bucket collectors\n", + "\u001b[36m(BucketsHash pid=52174)\u001b[0m 23:32:37 INFO - processing buckets 0 long, 13235 short\n", + "\u001b[36m(BucketsHash pid=52174)\u001b[0m 23:32:37 INFO - Done submitting long buckets\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:38 INFO - Done processing buckets in 0.01374209721883138 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:38 INFO - creating document snapshots\n", + "\u001b[36m(BucketsHashProcessorInvoker pid=52204)\u001b[0m 23:32:38 INFO - Waiting bucket processing completion\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:40 INFO - document snapshots created\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:43 INFO - Completed 1 files in 0.04649122953414917 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:45 INFO - Completed 2 files in 0.0795912504196167 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:45 INFO - Completed 2 files (40.0%) in 0.0795974850654602 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:54 INFO - Completed processing 5 files in 0.23002409934997559 min\n", + "\u001b[36m(orchestrate pid=52167)\u001b[0m 23:32:54 INFO - done flushing in 0.004540920257568359 sec\n", + "23:33:04 INFO - Completed execution in 0.791690719127655 min, execution result 0\n" ] }, { @@ -726,7 +725,7 @@ "0" ] }, - "execution_count": 25, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -736,7 +735,7 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform import FdedupRayTransformConfiguration\n", + "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", "\n", "# create parameters\n", "\n", @@ -802,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 11, "id": "a8ec4fb6-fa62-45d1-9aa1-596d7182b2c9", "metadata": {}, "outputs": [], @@ -823,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 12, "id": "48dbb2a3-a6f4-4a3d-bb2f-8491fd063611", "metadata": {}, "outputs": [ @@ -831,31 +830,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "16:01:12 INFO - Running locally\n", - "16:01:12 INFO - data factory proglang_select_ is using local configuration without input/output path\n", - "16:01:12 INFO - data factory proglang_select_ max_files -1, n_sample -1\n", - "16:01:12 INFO - data factory proglang_select_ Not using data sets, checkpointing None, max files -1, random samples -1, files to use None\n", - "16:01:12 INFO - data factory data_ is using local data access: input_folder - test-data/fdedup_out output_folder - test-data/lang_out\n", - "16:01:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "16:01:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n", - "16:01:12 INFO - number of workers 1 worker options {'num_cpus': 0.8}\n", - "16:01:12 INFO - pipeline id pipeline_id; number workers 1\n", - "16:01:12 INFO - job details {'job category': 'preprocessing', 'job name': 'proglang_select', 'job type': 'ray', 'job id': 'job_id'}\n", - "16:01:12 INFO - code location None\n", - "16:01:12 INFO - actor creation delay 0\n", - "2024-05-14 16:01:16,116\tINFO worker.py:1715 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - orchestrator started at 2024-05-14 16:01:18\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Number of files is 1, source profile {'max_file_size': 0.44320201873779297, 'min_file_size': 0.44320201873779297, 'total_file_size': 0.44320201873779297}\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 38.956433868967, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Number of workers - 1 with {'num_cpus': 0.8} each\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Getting supported languages from file ./test-data/allowed-code-languages.txt\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Supported languages ['Java', 'C', 'Go', 'Text', 'Python', 'Markdown']\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Placed language list into Ray object storage under referenceObjectRef(001b78d7db9a6593ffffffffffffffffffffffff0100000002e1f505)\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:18 INFO - Completed 0 files in 7.2161356608072914e-06 min. Waiting for completion\n", - "\u001b[36m(TransformTableProcessorRay pid=54103)\u001b[0m 16:01:20 INFO - Loading languages to include from Ray storage under reference ObjectRef(001b78d7db9a6593ffffffffffffffffffffffff0100000002e1f505)\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:20 INFO - Completed processing in 0.028798997402191162 min\n", - "\u001b[36m(orchestrate pid=54089)\u001b[0m 16:01:20 INFO - done flushing in 0.0009629726409912109 sec\n", - "16:01:30 INFO - Completed execution in 0.30742111603418987 min, execution result 0\n" + "23:33:05 INFO - Running locally\n", + "23:33:05 INFO - data factory proglang_select_ is using local configuration without input/output path\n", + "23:33:05 INFO - data factory proglang_select_ max_files -1, n_sample -1\n", + "23:33:05 INFO - data factory proglang_select_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:33:05 INFO - data factory data_ is using local data access: input_folder - test-data/fdedup_out output_folder - test-data/lang_out\n", + "23:33:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:33:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:33:05 INFO - pipeline id pipeline_id\n", + "23:33:05 INFO - code location None\n", + "23:33:05 INFO - number of workers 1 worker options {'num_cpus': 0.8}\n", + "23:33:05 INFO - actor creation delay 0\n", + "23:33:05 INFO - job details {'job category': 'preprocessing', 'job name': 'proglang_select', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:33:07,489\tINFO worker.py:1715 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - orchestrator started at 2024-06-19 23:33:08\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Number of files is 5, source profile {'max_file_size': 7.656813621520996, 'min_file_size': 0.005321502685546875, 'total_file_size': 8.772536277770996}\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 31.42457122821361, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Number of workers - 1 with {'num_cpus': 0.8} each\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Getting supported languages from file ./test-data/allowed-code-languages.txt\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Supported languages b'Java\\nC\\nGo\\nText\\nPython\\nMarkdown'\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Placed language list into Ray object storage under referenceObjectRef(00d950ec0ccf9d2affffffffffffffffffffffff0100000002e1f505)\n", + "\u001b[36m(RayTransformFileProcessor pid=52252)\u001b[0m 23:33:08 INFO - Loading languages to include from Ray storage under reference ObjectRef(00d950ec0ccf9d2affffffffffffffffffffffff0100000002e1f505)\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Completed 1 files in 0.009732401371002198 min\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Completed 2 files in 0.009788183371225993 min\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Completed 3 files in 0.010748851299285888 min\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Completed 4 files in 0.010778764883677164 min\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Completed 4 files (80.0%) in 0.010779603322347005 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - Completed processing 5 files in 0.010830299059549967 min\n", + "\u001b[36m(orchestrate pid=52238)\u001b[0m 23:33:08 INFO - done flushing in 0.0004150867462158203 sec\n", + "23:33:18 INFO - Completed execution in 0.22255444924036663 min, execution result 0\n" ] }, { @@ -864,7 +867,7 @@ "0" ] }, - "execution_count": 31, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -874,7 +877,7 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from proglang_select_transform import (\n", + "from proglang_select_transform_ray import (\n", " ProgLangSelectRayConfiguration,\n", " lang_allowed_langs_file_key,\n", " lang_lang_column_key,\n", @@ -936,7 +939,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 13, "id": "9f080011-c9fe-430e-9ecc-f2220d2c8d18", "metadata": {}, "outputs": [ @@ -967,7 +970,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 14, "id": "29319fb9-b0d8-4f86-9bc5-b92960ad8ae5", "metadata": {}, "outputs": [ @@ -975,31 +978,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "16:02:42 INFO - Running locally\n", - "16:02:42 INFO - data factory data_ is using local data access: input_folder - test-data/lang_out output_folder - test-data/cq_out\n", - "16:02:42 INFO - data factory data_ max_files -1, n_sample -1\n", - "16:02:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n", - "16:02:42 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", - "16:02:42 INFO - pipeline id pipeline_id; number workers 3\n", - "16:02:42 INFO - job details {'job category': 'preprocessing', 'job name': 'code_quality', 'job type': 'ray', 'job id': 'job_id'}\n", - "16:02:42 INFO - code location None\n", - "16:02:42 INFO - actor creation delay 0\n", - "2024-05-14 16:02:46,129\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:49 INFO - orchestrator started at 2024-05-14 16:02:49\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:49 INFO - Number of files is 1, source profile {'max_file_size': 0.4435100555419922, 'min_file_size': 0.4435100555419922, 'total_file_size': 0.4435100555419922}\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:49 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 39.04623680189252, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:49 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:49 INFO - Completed 0 files in 7.502237955729166e-06 min. Waiting for completion\n", - "\u001b[36m(TransformTableProcessorRay pid=54172)\u001b[0m /Users/himapatel/Work/Projects/MCD/OpenSource/Demo/data-prep-kit/examples/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", - "\u001b[36m(TransformTableProcessorRay pid=54172)\u001b[0m warnings.warn(\n", - "\u001b[36m(TransformTableProcessorRay pid=54174)\u001b[0m Token indices sequence length is longer than the specified maximum sequence length for this model (1676 > 1024). Running this sequence through the model will result in indexing errors\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:53 INFO - Completed processing in 0.07286310195922852 min\n", - "\u001b[36m(orchestrate pid=54160)\u001b[0m 16:02:53 INFO - done flushing in 0.0032660961151123047 sec\n", - "16:03:03 INFO - Completed execution in 0.3572404503822327 min, execution result 0\n", - "\u001b[36m(TransformTableProcessorRay pid=54173)\u001b[0m None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\u001b[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)\u001b[0m\n", - "\u001b[36m(TransformTableProcessorRay pid=54174)\u001b[0m /Users/himapatel/Work/Projects/MCD/OpenSource/Demo/data-prep-kit/examples/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\u001b[32m [repeated 2x across cluster]\u001b[0m\n", - "\u001b[36m(TransformTableProcessorRay pid=54174)\u001b[0m warnings.warn(\u001b[32m [repeated 2x across cluster]\u001b[0m\n" + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n", + "23:33:20 INFO - Running locally\n", + "23:33:20 INFO - data factory data_ is using local data access: input_folder - test-data/lang_out output_folder - test-data/cq_out\n", + "23:33:20 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:33:20 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:33:20 INFO - pipeline id pipeline_id\n", + "23:33:20 INFO - code location None\n", + "23:33:20 INFO - number of workers 3 worker options {'num_cpus': 0.8}\n", + "23:33:20 INFO - actor creation delay 0\n", + "23:33:20 INFO - job details {'job category': 'preprocessing', 'job name': 'code_quality', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:33:22,278\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:23 INFO - orchestrator started at 2024-06-19 23:33:23\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:23 INFO - Number of files is 5, source profile {'max_file_size': 7.65715217590332, 'min_file_size': 0.005566596984863281, 'total_file_size': 8.77397632598877}\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:23 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 30.955674744211137, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:23 INFO - Number of workers - 3 with {'num_cpus': 0.8} each\n", + "\u001b[36m(RayTransformFileProcessor pid=52295)\u001b[0m /Users/shivdeep/workspace/projects/current/oss-data-prep/test-ingest2parquet/data-prep-kit/examples/venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "\u001b[36m(RayTransformFileProcessor pid=52295)\u001b[0m warnings.warn(\n", + "\u001b[36m(RayTransformFileProcessor pid=52295)\u001b[0m Token indices sequence length is longer than the specified maximum sequence length for this model (1676 > 1024). Running this sequence through the model will result in indexing errors\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:24 INFO - Completed 1 files in 0.02450234889984131 min\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:24 INFO - Completed 2 files in 0.024543317159016927 min\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:24 INFO - Completed 2 files (40.0%) in 0.024544183413187662 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:36 INFO - Completed processing 5 files in 0.2136624813079834 min\n", + "\u001b[36m(orchestrate pid=52278)\u001b[0m 23:33:36 INFO - done flushing in 0.005311012268066406 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=52294)\u001b[0m None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\u001b[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=52294)\u001b[0m /Users/shivdeep/workspace/projects/current/oss-data-prep/test-ingest2parquet/data-prep-kit/examples/venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=52294)\u001b[0m warnings.warn(\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=52294)\u001b[0m Token indices sequence length is longer than the specified maximum sequence length for this model (2389 > 1024). 
Running this sequence through the model will result in indexing errors\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "23:33:46 INFO - Completed execution in 0.4255295991897583 min, execution result 0\n" ] }, { @@ -1008,7 +1015,7 @@ "0" ] }, - "execution_count": 35, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1018,7 +1025,7 @@ "import sys\n", "from pathlib import Path\n", "\n", - "from code_quality_transform import CodeQualityRayTransformConfiguration\n", + "from code_quality_transform_ray import CodeQualityRayTransformConfiguration\n", "from data_processing.utils import ParamsUtils\n", "\n", "local_conf = {\n", @@ -1050,9 +1057,9 @@ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", - "# launch\n", "# create launcher\n", "launcher = RayTransformLauncher(CodeQualityRayTransformConfiguration())\n", + "# launch\n", "launcher.launch()" ] }, @@ -1077,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 15, "id": "a7991811-b19e-43b5-89ac-b24060c0ccfa", "metadata": {}, "outputs": [], @@ -1096,7 +1103,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 16, "id": "61dea2b0-0e54-4912-8620-886e2b8420ef", "metadata": {}, "outputs": [ @@ -1104,24 +1111,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "16:04:48 INFO - Running locally\n", - "16:04:48 INFO - data factory data_ is using local data access: input_folder - test-data/cq_out output_folder - test-data/filter_out\n", - "16:04:48 INFO - data factory data_ max_files -1, n_sample -1\n", - "16:04:48 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n", - "16:04:48 INFO - number of workers 5 worker options {'num_cpus': 0.8}\n", - "16:04:48 INFO - pipeline id pipeline_id; number workers 5\n", - "16:04:48 INFO - job details {'job category': 'preprocessing', 'job name': 'filter', 'job type': 'ray', 'job id': 'job_id'}\n", - "16:04:48 INFO - code location None\n", - "16:04:48 INFO - actor creation delay 0\n", - "2024-05-14 16:04:51,445\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:54 INFO - orchestrator started at 2024-05-14 16:04:54\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:54 INFO - Number of files is 1, source profile {'max_file_size': 0.4591951370239258, 'min_file_size': 0.4591951370239258, 'total_file_size': 0.4591951370239258}\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:54 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 39.04187660291791, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:54 INFO - Number of workers - 5 with {'num_cpus': 0.8} each\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:54 INFO - Completed 0 files in 7.3552131652832035e-06 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:56 INFO - Completed processing in 0.03446693817774455 min\n", - "\u001b[36m(orchestrate pid=54239)\u001b[0m 16:04:56 INFO - done flushing in 0.0033478736877441406 sec\n", - "16:05:06 INFO - Completed execution in 0.29624868631362916 min, execution result 0\n" + "23:33:47 INFO - Running locally\n", + "23:33:47 INFO - data factory data_ is using local data access: input_folder - test-data/cq_out output_folder - test-data/filter_out\n", + "23:33:47 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:33:47 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:33:47 INFO - pipeline id pipeline_id\n", + "23:33:47 INFO - code location None\n", + "23:33:47 INFO - number of workers 5 worker options {'num_cpus': 0.8}\n", + "23:33:47 INFO - actor creation delay 0\n", + "23:33:47 INFO - job details {'job category': 'preprocessing', 'job name': 'filter', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:33:49,730\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:50 INFO - orchestrator started at 2024-06-19 23:33:50\n", + "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:50 INFO - Number of files is 5, source profile {'max_file_size': 7.711193084716797, 'min_file_size': 0.009368896484375, 'total_file_size': 8.878622055053711}\n", + "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 31.283493041992188, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:50 INFO - Number of workers - 5 with {'num_cpus': 0.8} each\n", + "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:50 INFO - Completed 0 files (0.0%) in 4.8319498697916664e-06 min. 
Waiting for completion\n",
+ "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:51 INFO - Completed processing 5 files in 0.013345913092295329 min\n",
+ "\u001b[36m(orchestrate pid=52332)\u001b[0m 23:33:51 INFO - done flushing in 0.0010788440704345703 sec\n",
+ "23:34:01 INFO - Completed execution in 0.22851296265920004 min, execution result 0\n"
 ]
 },
 {
@@ -1130,7 +1137,7 @@
 "0"
 ]
 },
- "execution_count": 38,
+ "execution_count": 16,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1140,11 +1147,11 @@
 "\n",
 "from data_processing.data_access import DataAccessLocal\n",
 "from filter_transform import (\n",
- "    FilterRayTransformConfiguration,\n",
 "    filter_columns_to_drop_cli_param,\n",
 "    filter_criteria_cli_param,\n",
 "    filter_logical_operator_cli_param,\n",
 ")\n",
+ "from filter_transform_ray import FilterRayTransformConfiguration\n",
 "\n",
 "local_conf = {\n",
 "    \"input_folder\": input_folder,\n",
@@ -1178,9 +1185,9 @@
 "\n",
 "\n",
 "sys.argv = ParamsUtils.dict_to_req(launcher_params | filter_params)\n",
- "    # Create the longer to launch with the blocklist transform.\n",
- "launcher =RayTransformLauncher(FilterRayTransformConfiguration())\n",
- "    # Launch the ray actor(s) to process the input\n",
+ "# Create the launcher for the filter transform.\n",
+ "launcher = RayTransformLauncher(FilterRayTransformConfiguration())\n",
+ "# Launch the ray actor(s) to process the input\n",
 "launcher.launch()"
 ]
 },
@@ -1198,7 +1205,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 39,
+ "execution_count": 17,
 "id": "20a153fa-fd56-401e-86be-4f7617affcc8",
 "metadata": {},
 "outputs": [],
@@ -1209,7 +1216,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
 "id": "228df6b2-bc62-494b-9697-03ece98d7853",
 "metadata": {},
 "outputs": [
@@ -1217,21 +1224,47 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "16:07:10 INFO - Running locally\n",
- "16:07:10 INFO - data factory data_ is using local data access: input_folder - test-data/filter_out output_folder - test-data/tokenization_out\n",
- "16:07:10 INFO - data factory data_ max_files -1, n_sample -1\n",
- "16:07:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet']\n",
- "16:07:10 INFO - number of workers 5 worker options {'num_cpus': 0.8}\n",
- "16:07:10 INFO - pipeline id pipeline_id; number workers 5\n",
- "16:07:10 INFO - job details {'job category': 'preprocessing', 'job name': 'Tokenization', 'job type': 'ray', 'job id': 'job_id'}\n",
- "16:07:10 INFO - code location None\n",
- "16:07:10 INFO - actor creation delay 0\n",
- "2024-05-14 16:07:14,722\tINFO worker.py:1715 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "23:34:02 INFO - Running locally\n", + "23:34:02 INFO - data factory data_ is using local data access: input_folder - test-data/filter_out output_folder - test-data/tokenization_out\n", + "23:34:02 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:34:02 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:34:02 INFO - pipeline id pipeline_id\n", + "23:34:02 INFO - code location None\n", + "23:34:02 INFO - number of workers 5 worker options {'num_cpus': 0.8}\n", + "23:34:02 INFO - actor creation delay 0\n", + "23:34:02 INFO - job details {'job category': 'preprocessing', 'job name': 'Tokenization', 'job type': 'ray', 'job id': 'job_id'}\n", + "2024-06-19 23:34:04,667\tINFO worker.py:1715 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:05 INFO - orchestrator started at 2024-06-19 23:34:05\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:05 INFO - Number of files is 5, source profile {'max_file_size': 0.3004436492919922, 'min_file_size': 0.004254341125488281, 'total_file_size': 0.48981380462646484}\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:05 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 31.70830688532442, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:05 INFO - Number of workers - 5 with {'num_cpus': 0.8} each\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:05 INFO - Completed 0 files (0.0%) in 4.2994817097981775e-06 min. Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=52394)\u001b[0m /Users/shivdeep/workspace/projects/current/oss-data-prep/test-ingest2parquet/data-prep-kit/examples/venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "\u001b[36m(RayTransformFileProcessor pid=52394)\u001b[0m warnings.warn(\n", + "\u001b[36m(RayTransformFileProcessor pid=52396)\u001b[0m Token indices sequence length is longer than the specified maximum sequence length for this model (2344 > 2048). Running this sequence through the model will result in indexing errors\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:07 INFO - Completed processing 5 files in 0.02799654801686605 min\n", + "\u001b[36m(orchestrate pid=52380)\u001b[0m 23:34:07 INFO - done flushing in 0.0008699893951416016 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=52395)\u001b[0m 23:34:07 WARNING - table is empty, skipping processing\n", + "23:34:17 INFO - Completed execution in 0.23911206324895223 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=52396)\u001b[0m None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\u001b[32m [repeated 5x across cluster]\u001b[0m\n",
+ "\u001b[36m(RayTransformFileProcessor pid=52396)\u001b[0m /Users/shivdeep/workspace/projects/current/oss-data-prep/test-ingest2parquet/data-prep-kit/examples/venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\u001b[32m [repeated 4x across cluster]\u001b[0m\n",
+ "\u001b[36m(RayTransformFileProcessor pid=52396)\u001b[0m warnings.warn(\u001b[32m [repeated 4x across cluster]\u001b[0m\n"
 ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
 ],
 "source": [
- "from tokenization_transform import TokenizationRayConfiguration\n",
+ "from tokenization_transform_ray import TokenizationRayConfiguration\n",
 "\n",
 "local_conf = {\n",
 "    \"input_folder\": input_folder,\n",
@@ -1254,14 +1287,6 @@
 "# Launch the ray actor(s) to process the input\n",
 "launcher.launch()\n"
 ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1029fecc-fcd5-47c9-afbe-c6a3a9daa558",
- "metadata": {},
- "outputs": [],
- "source": []
 }
 ],
 "metadata": {
@@ -1280,7 +1305,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.11.8"
+ "version": "3.10.3"
 }
 },
 "nbformat": 4,
diff --git a/examples/launch.sh b/examples/launch.sh
index e25d734f4..c1e978b70 100644
--- a/examples/launch.sh
+++ b/examples/launch.sh
@@ -5,18 +5,19 @@
 echo $REPO_ROOT
 # Set PYTHONPATH for `data_processing_ray` library
 export PYTHONPATH=$REPO_ROOT/data-processing-lib/python/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/data-processing-lib/ray/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/data-processing-lib/ray/src:$REPO_ROOT/data-processing-lib/python/src
 # Set PYTHONPATH for transforms
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/malware/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/code_quality/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/proglang_select/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/ededup/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/fdedup/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/filter/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/doc_id/ray/src
-export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/tokenization/ray/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/malware/ray/src:$REPO_ROOT/transforms/code/malware/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/code_quality/ray/src:$REPO_ROOT/transforms/code/code_quality/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/proglang_select/ray/src:$REPO_ROOT/transforms/code/proglang_select/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/ededup/ray/src:$REPO_ROOT/transforms/universal/ededup/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/fdedup/ray/src:$REPO_ROOT/transforms/universal/fdedup/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/filter/ray/src:$REPO_ROOT/transforms/universal/filter/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/doc_id/ray/src:$REPO_ROOT/transforms/universal/doc_id/python/src
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/universal/tokenization/ray/src:$REPO_ROOT/transforms/universal/tokenization/python/src
 export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/tools/ingest2parquet/src/
+export PYTHONPATH=$PYTHONPATH:$REPO_ROOT/transforms/code/ingest_2_parquet/ray/src:$REPO_ROOT/transforms/code/ingest_2_parquet/python/src
 
 . ./venv/bin/activate
 jupyter notebook demo_with_launcher.ipynb
diff --git a/examples/prepare_env.sh b/examples/prepare_env.sh
index f0cf24539..fd18feb1f 100644
--- a/examples/prepare_env.sh
+++ b/examples/prepare_env.sh
@@ -12,10 +12,8 @@
 "$REPO_ROOT/transforms/code/proglang_select/ray/requirements.txt"
 "$REPO_ROOT/transforms/universal/ededup/ray/requirements.txt"
 "$REPO_ROOT/transforms/universal/fdedup/ray/requirements.txt"
-"$REPO_ROOT/transforms/universal/filter/ray/requirements.txt"
 "$REPO_ROOT/tools/ingest2parquet/requirements.txt"
 "$REPO_ROOT/transforms/universal/doc_id/ray/requirements.txt"
-"$REPO_ROOT/transforms/universal/tokenization/ray/requirements.txt"
 )
 
 # Iterate through the list and install requirements from each file
@@ -24,3 +22,5 @@ do
 echo "Install packages from $requirements_file"
 pip install -r "$requirements_file"
 done
+
+pip install duckdb
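Taken together, the changes above follow one pattern: each notebook cell now imports its configuration from the renamed `*_transform_ray` module, and launch.sh puts both the `ray/src` and `python/src` trees of every transform on PYTHONPATH so those modules resolve. A minimal sketch of the launch pattern the cells share, shown here for the filter transform; the parameter keys (`run_locally`, `data_local_config`) and the `ParamsUtils.convert_to_ast` helper follow the notebook's cells and are assumptions, not part of this diff:

import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from filter_transform_ray import FilterRayTransformConfiguration

# Input/output folders, mirroring the notebook's test-data layout (assumed paths).
local_conf = {
    "input_folder": "test-data/cq_out",
    "output_folder": "test-data/filter_out",
}
params = {
    "run_locally": True,  # assumed key: starts a local Ray instance, as in the logs above
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
}

# The launcher reads its configuration from sys.argv.
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = RayTransformLauncher(FilterRayTransformConfiguration())
result = launcher.launch()  # 0 on success, matching the execution result cells above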