Skip to content

Commit

Permalink
Update image retrieval sample pipeline (#464)
Browse files Browse the repository at this point in the history
- Update the sample pipeline: ensure the cache is disabled, set
`input_partition_rows` to 1000 to be sure we don't run out of memory, and disable image resizing
- Remove custom component pipeline
  • Loading branch information
mrchtr authored Sep 26, 2023
1 parent df83004 commit 7575f04
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 133 deletions.

This file was deleted.

This file was deleted.

Empty file.

This file was deleted.

59 changes: 0 additions & 59 deletions examples/pipelines/filter-cc-25m/filter_pipeline.py

This file was deleted.

15 changes: 11 additions & 4 deletions examples/pipelines/filter-cc-25m/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@ def create_directory_if_not_exists(path):


PIPELINE_NAME = "cc-image-filter-pipeline"
PIPELINE_DESCRIPTION = "Load cc image dataset and reduce to PNG files"
PIPELINE_DESCRIPTION = "Load cc image dataset"
BASE_PATH = "./data"
BASE_PATH = create_directory_if_not_exists(BASE_PATH)

# Define pipeline
pipeline = Pipeline(pipeline_name=PIPELINE_NAME, base_path=BASE_PATH)


# Load from hub component
load_component_column_mapping = {
"alt_text": "images_alt+text",
Expand All @@ -41,11 +40,19 @@ def create_directory_if_not_exists(path):
"column_name_mapping": load_component_column_mapping,
"n_rows_to_load": 10000, # Here you can modify the number of images you want to download.
},
# If you run the pipeline several times with the cache set to the default settings,
# cached pipeline steps will be skipped.
cache=False,
)

# Download images component
download_images = ComponentOp.from_registry(name="download_images", arguments={})

download_images = ComponentOp.from_registry(
name="download_images",
arguments={"input_partition_rows": 1000, "resize_mode": "no"},
# If you run the pipeline several times with the cache set to the default settings,
# cached pipeline steps will be skipped.
cache=False,
)

# Add components to the pipeline
pipeline.add_op(load_from_hf_hub)
Expand Down

0 comments on commit 7575f04

Please sign in to comment.