Draft PR for implementing generic read/write components.

Note: the changes from `feature/local-starcoder` were merged into this branch to make testing easier. Merge this PR after the local starcoder PR.

Things to do:
- [x] Modify the `from_registry` method to enable passing a path to a custom spec (see the sketch after this list)
- [x] Figure out how to present the spec template to the user. Should we keep the image/args and just add a TODO in the `produces` and `consumes`? (open for suggestions)
- [x] Switch all pipelines over to the generic load/write components and remove the custom ones
- [x] Add components to the affected pipelines to fill in missing metadata (e.g. width/height)
- [x] Add documentation
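A minimal sketch of the reworked `from_registry` call described above. The import path follows Fondant's pipeline API, but the `component_spec_path` keyword is an assumption based on the task list, not a confirmed signature:

```python
from fondant.pipeline import ComponentOp

# Hypothetical: pull the generic component from the registry, but point it
# at a project-local spec that fills in the template's `consumes`/`produces`.
load_op = ComponentOp.from_registry(
    name="load_from_hf_hub",
    component_spec_path="custom_specs/load_from_hf_hub_spec.yaml",  # assumed kwarg
    arguments={"dataset_name": "my_user/my_dataset"},  # placeholder values
)
```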
1 parent e8d672e · commit 3e25077
Showing 36 changed files with 675 additions and 234 deletions.
File renamed without changes.
components/image_resolution_extraction/fondant_component.yaml (19 additions & 0 deletions)

```yaml
name: Image resolution extraction
description: Component that extracts image resolution data from the images
image: ghcr.io/ml6team/image_resolution_extraction:latest

consumes:
  images:
    fields:
      data:
        type: binary

produces:
  images:
    fields:
      width:
        type: int16
      height:
        type: int16
      data:
        type: binary
```
components/image_resolution_extraction/requirements.txt (new file, 4 additions)

```text
fondant
pyarrow>=7.0
gcsfs==2023.4.0
imagesize==1.4.1
```
components/image_resolution_extraction/src/main.py (new file, 52 additions)

```python
"""This component extracts image resolution data from the images."""
import io
import logging
import typing as t

import imagesize
import numpy as np
import pandas as pd

from fondant.component import PandasTransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


def extract_dimensions(image: bytes) -> t.Tuple[np.int16, np.int16]:
    """Extract the width and height of an image.

    Args:
        image: the raw image bytes

    Returns:
        The width and height of the image as int16.
    """
    width, height = imagesize.get(io.BytesIO(image))

    return np.int16(width), np.int16(height)


class ImageResolutionExtractionComponent(PandasTransformComponent):
    """Component that extracts image dimensions."""

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            dataframe: pandas dataframe with an ("images", "data") column.

        Returns:
            The dataframe with ("images", "width") and ("images", "height") added.
        """
        logger.info("Extracting image resolutions...")

        # Expand each image's (width, height) tuple into the two new columns
        dataframe[[("images", "width"), ("images", "height")]] = dataframe[
            ("images", "data")
        ].apply(lambda x: pd.Series(extract_dimensions(x)))

        return dataframe


if __name__ == "__main__":
    component = ImageResolutionExtractionComponent.from_args()
    component.run()
```
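For reference, a quick smoke test of the dimension extraction, using `extract_dimensions` as defined in the component above (`example.png` is a placeholder path):

```python
import pandas as pd

with open("example.png", "rb") as f:  # placeholder image file
    image_bytes = f.read()

width, height = extract_dimensions(image_bytes)
print(width, height)  # e.g. 640 480 for a 640x480 image

# The same expansion the component performs in `transform`, on a toy
# dataframe using Fondant's (subset, field) two-level column layout:
df = pd.DataFrame({("images", "data"): [image_bytes]})
df.columns = pd.MultiIndex.from_tuples(df.columns)
df[[("images", "width"), ("images", "height")]] = df[
    ("images", "data")
].apply(lambda x: pd.Series(extract_dimensions(x)))
```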
Empty file.
components/write_to_hf_hub/fondant_component.yaml (new file, 28 additions)

```yaml
name: Write to hub
description: Component that writes a dataset to the hub
image: ghcr.io/ml6team/write_to_hf_hub:latest

consumes:
  dummy_variable: # TODO: fill in here
    fields:
      data:
        type: binary

args:
  hf_token:
    description: The Hugging Face token used to write to the hub
    type: str
  username:
    description: The username under which to upload the dataset
    type: str
  dataset_name:
    description: The name of the dataset to upload
    type: str
  image_column_names:
    description: A list containing the image column names, used to format the image columns to the HF hub format
    type: list
    default: None
  column_name_mapping:
    description: Mapping of the consumed fondant column names to the written hub column names
    type: dict
    default: None
```
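Using the same hypothetical `from_registry` call sketched under the commit message, a pipeline might wire this component up roughly as follows (all argument values are placeholders; the keyword for the custom spec path is an assumption):

```python
from fondant.pipeline import ComponentOp

# Hypothetical: a custom spec copied from the template above, with
# `dummy_variable` replaced by the subset the pipeline actually produces.
write_op = ComponentOp.from_registry(
    name="write_to_hf_hub",
    component_spec_path="custom_specs/write_to_hf_hub_spec.yaml",  # assumed kwarg
    arguments={
        "hf_token": "hf_...",  # placeholder token
        "username": "my_user",
        "dataset_name": "my_dataset",
        "image_column_names": ["images_data"],
        "column_name_mapping": {"images_data": "image"},
    },
)
```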
1 change: 1 addition & 0 deletions — .../write_to_hub_controlnet/requirements.txt → components/write_to_hf_hub/requirements.txt

```text
huggingface_hub==0.14.1
datasets==2.10.1
fondant
pyarrow>=7.0
Pillow==9.4.0
```
components/write_to_hf_hub/src/main.py (new file, 101 additions)

```python
"""This component writes an image dataset to the hub."""
import logging
import typing as t
from io import BytesIO

import dask.dataframe as dd
import datasets
import huggingface_hub
from PIL import Image

from fondant.component import WriteComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


def convert_bytes_to_image(
    image_bytes: bytes, feature_encoder: datasets.Image
) -> t.Dict[str, t.Any]:
    """Convert image bytes to the HF image format.

    Args:
        image_bytes: the image as a bytestring
        feature_encoder: HF image feature encoder

    Returns:
        The HF image representation.
    """
    image = Image.open(BytesIO(image_bytes))
    return feature_encoder.encode_example(image)


class WriteToHubComponent(WriteComponent):
    def write(
        self,
        dataframe: dd.DataFrame,
        *,
        hf_token: str,
        username: str,
        dataset_name: str,
        image_column_names: t.Optional[list],
        column_name_mapping: t.Optional[dict],
    ):
        """
        Args:
            dataframe: Dask dataframe
            hf_token: The Hugging Face token used to write to the hub
            username: The username under which to upload the dataset
            dataset_name: The name of the dataset to upload
            image_column_names: A list containing the subset image column names. Used to
                format the image fields to HF hub format
            column_name_mapping: Mapping of the consumed fondant column names to the
                written hub column names.
        """
        # Log in to the hub
        huggingface_hub.login(token=hf_token)

        # Create the HF dataset repository
        repo_id = f"{username}/{dataset_name}"
        repo_path = f"hf://datasets/{repo_id}"
        logger.info(f"Creating HF dataset repository under ID: '{repo_id}'")
        huggingface_hub.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

        # Derive the columns to write and the arrow schema from the component spec
        write_columns = []
        schema_dict = {}
        for subset_name, subset in self.spec.consumes.items():
            for field in subset.fields.values():
                column_name = f"{subset_name}_{field.name}"
                write_columns.append(column_name)
                if image_column_names and column_name in image_column_names:
                    schema_dict[column_name] = datasets.Image()
                else:
                    schema_dict[column_name] = datasets.Value(str(field.type.value))

        schema = datasets.Features(schema_dict).arrow_schema
        dataframe = dataframe[write_columns]

        # Map image columns to the HF image format
        feature_encoder = datasets.Image(decode=True)

        if image_column_names is not None:
            for image_column_name in image_column_names:
                dataframe[image_column_name] = dataframe[image_column_name].map(
                    lambda x: convert_bytes_to_image(x, feature_encoder),
                    meta=(image_column_name, "object"),
                )

        # Map fondant column names to hub column names
        if column_name_mapping:
            dataframe = dataframe.rename(columns=column_name_mapping)

        # Write the dataset to the hub as parquet files
        dd.to_parquet(dataframe, path=f"{repo_path}/data", schema=schema)


if __name__ == "__main__":
    component = WriteToHubComponent.from_args()
    component.run()
```
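To make the `{subset}_{field}` naming concrete: for a hypothetical spec consuming an `images` subset with a binary `data` field (listed in `image_column_names`) and an int16 `width` field, the loop above would derive:

```python
import datasets

# Hypothetical result of the spec-driven loop in `write` above:
write_columns = ["images_data", "images_width"]
schema_dict = {
    "images_data": datasets.Image(),          # image columns become HF Image features
    "images_width": datasets.Value("int16"),  # everything else becomes a plain Value
}
```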