From e564a341e8f437f3fb1e7e86c8904d63cc6bde45 Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 11:53:00 +0100
Subject: [PATCH 1/4] Add LlamaHub load component from use case repo

---
 components/load_with_llamahub/Dockerfile      |  24 ++++
 .../load_with_llamahub/fondant_component.yaml |  47 ++++++++
 .../load_with_llamahub/requirements.txt       |   2 +
 components/load_with_llamahub/src/main.py     | 110 ++++++++++++++++++
 .../tests/component_test.py                   |  35 ++++++
 .../tests/fondant_component.yaml              |  50 ++++++++
 .../load_with_llamahub/tests/pytest.ini       |   2 +
 .../load_with_llamahub/tests/requirements.txt |   1 +
 8 files changed, 271 insertions(+)
 create mode 100644 components/load_with_llamahub/Dockerfile
 create mode 100644 components/load_with_llamahub/fondant_component.yaml
 create mode 100644 components/load_with_llamahub/requirements.txt
 create mode 100644 components/load_with_llamahub/src/main.py
 create mode 100644 components/load_with_llamahub/tests/component_test.py
 create mode 100644 components/load_with_llamahub/tests/fondant_component.yaml
 create mode 100644 components/load_with_llamahub/tests/pytest.ini
 create mode 100644 components/load_with_llamahub/tests/requirements.txt

diff --git a/components/load_with_llamahub/Dockerfile b/components/load_with_llamahub/Dockerfile
new file mode 100644
index 000000000..a7851484b
--- /dev/null
+++ b/components/load_with_llamahub/Dockerfile
@@ -0,0 +1,24 @@
+FROM --platform=linux/amd64 python:3.8-slim as base
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component
+COPY src/ src/
+
+FROM base as test
+COPY tests/ tests/
+RUN pip3 install --no-cache-dir -r tests/requirements.txt
+RUN python -m pytest tests
+
+FROM base
+WORKDIR /component/src
+ENTRYPOINT ["fondant", "execute", "main"]
+
diff --git a/components/load_with_llamahub/fondant_component.yaml b/components/load_with_llamahub/fondant_component.yaml
new file mode 100644
index 000000000..b8da90ac2
--- /dev/null
+++ b/components/load_with_llamahub/fondant_component.yaml
@@ -0,0 +1,47 @@
+name: Load with LlamaHub
+description: |
+  Load data using a LlamaHub loader. For available loaders, check the
+  [LlamaHub](https://llamahub.ai/).
+image: ghcr.io/ml6team/load_with_llamahub:dev
+
+produces:
+  text:
+    type: string
+  #TODO: Add extra fields to extract from document metadata
+
+args:
+  loader_class:
+    description: |
+      The name of the LlamaIndex loader class to use. Make sure to provide the name and not the
+      id. The name is passed to `llama_index.download_loader` to download the specified loader.
+    type: str
+  loader_kwargs:
+    description: |
+      Keyword arguments to pass when instantiating the loader class. Check the documentation of
+      the loader to see which arguments it accepts.
+    type: str
+  load_kwargs:
+    description: |
+      Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of
+      the loader to see which arguments it accepts.
+    type: str
+  additional_requirements:
+    description: |
+      Some loaders require additional dependencies to be installed. You can specify those here.
+      Use a format accepted by `pip install`. E.g. "pypdf" or "pypdf==3.17.1". Unfortunately,
+      additional requirements for LlamaIndex loaders are not documented well, but if a dependency
+      is missing, a clear error message will be thrown.
+    type: list
+    default: []
+  n_rows_to_load:
+    description: |
+      Optional argument that defines the number of rows to load. Useful for testing pipeline runs
+      on a small scale.
+    type: int
+    default: None
+  index_column:
+    description: |
+      Column to set as index in the load component. If not specified, a default globally unique
+      index will be set.
+    type: str
+    default: None
diff --git a/components/load_with_llamahub/requirements.txt b/components/load_with_llamahub/requirements.txt
new file mode 100644
index 000000000..8356b3e35
--- /dev/null
+++ b/components/load_with_llamahub/requirements.txt
@@ -0,0 +1,2 @@
+fondant[component]==0.8.dev2
+llama-index==0.9.9
diff --git a/components/load_with_llamahub/src/main.py b/components/load_with_llamahub/src/main.py
new file mode 100644
index 000000000..4089f4f76
--- /dev/null
+++ b/components/load_with_llamahub/src/main.py
@@ -0,0 +1,110 @@
+import logging
+import subprocess
+import sys
+import typing as t
+from collections import defaultdict
+
+import dask.dataframe as dd
+import pandas as pd
+from fondant.component import DaskLoadComponent
+from fondant.core.component_spec import ComponentSpec
+from llama_index import download_loader
+
+logger = logging.getLogger(__name__)
+
+
+class LlamaHubReader(DaskLoadComponent):
+    def __init__(
+        self,
+        spec: ComponentSpec,
+        *,
+        loader_class: str,
+        loader_kwargs: dict,
+        load_kwargs: dict,
+        additional_requirements: t.List[str],
+        n_rows_to_load: t.Optional[int] = None,
+        index_column: t.Optional[str] = None,
+    ) -> None:
+        """
+        Args:
+            spec: the component spec
+            loader_class: The name of the LlamaIndex loader class to use
+            loader_kwargs: Keyword arguments to pass when instantiating the loader class
+            load_kwargs: Keyword arguments to pass to the `.load()` method of the loader
+            additional_requirements: Additional Python requirements to install
+            n_rows_to_load: Optional argument that defines the number of rows to load.
+                Useful for testing pipeline runs on a small scale.
+            index_column: Column to set as index in the load component. If not specified, a
+                default globally unique index will be set.
+ """ + self.n_rows_to_load = n_rows_to_load + self.index_column = index_column + self.spec = spec + + self.install_additional_requirements(additional_requirements) + + loader_cls = download_loader(loader_class) + self.loader = loader_cls(**loader_kwargs) + self.load_kwargs = load_kwargs + + @staticmethod + def install_additional_requirements(additional_requirements: t.List[str]): + for requirement in additional_requirements: + subprocess.check_call( # nosec + [sys.executable, "-m", "pip", "install", requirement], + ) + + def set_df_index(self, dask_df: dd.DataFrame) -> dd.DataFrame: + if self.index_column is None: + logger.info( + "Index column not specified, setting a globally unique index", + ) + + def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): + """Function that sets a unique index based on the partition and row number.""" + dataframe["id"] = 1 + dataframe["id"] = ( + str(partition_info["number"]) + + "_" + + (dataframe.id.cumsum()).astype(str) + ) + dataframe.index = dataframe.pop("id") + return dataframe + + def _get_meta_df() -> pd.DataFrame: + meta_dict = {"id": pd.Series(dtype="object")} + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) + return pd.DataFrame(meta_dict).set_index("id") + + meta = _get_meta_df() + dask_df = dask_df.map_partitions(_set_unique_index, meta=meta) + else: + logger.info(f"Setting `{self.index_column}` as index") + dask_df = dask_df.set_index(self.index_column, drop=True) + + return dask_df + + def load(self) -> dd.DataFrame: + try: + documents = self.loader.lazy_load_data(**self.load_kwargs) + except NotImplementedError: + documents = self.loader.load_data(**self.load_kwargs) + + doc_dict = defaultdict(list) + for d, document in enumerate(documents): + for column in self.spec.produces: + if column == "text": + doc_dict["text"].append(document.text) + else: + doc_dict[column].append(document.metadata.get(column)) + + if d == self.n_rows_to_load: + break + + dask_df = dd.from_dict(doc_dict, npartitions=1) + + dask_df = self.set_df_index(dask_df) + return dask_df diff --git a/components/load_with_llamahub/tests/component_test.py b/components/load_with_llamahub/tests/component_test.py new file mode 100644 index 000000000..217b42281 --- /dev/null +++ b/components/load_with_llamahub/tests/component_test.py @@ -0,0 +1,35 @@ +from pathlib import Path + +import yaml +from fondant.core.component_spec import ComponentSpec + +from src.main import LlamaHubReader + + +def test_arxiv_reader(): + """Test the component with the ArxivReader. + + This test requires a stable internet connection, both to download the loader, and to download + the papers from Arxiv. 
+ """ + with open(Path(__file__).with_name("fondant_component.yaml")) as f: + spec = yaml.safe_load(f) + spec = ComponentSpec(spec) + + component = LlamaHubReader( + spec=spec, + loader_class="ArxivReader", + loader_kwargs={}, + load_kwargs={ + "search_query": "jeff dean", + "max_results": 5, + }, + additional_requirements=["pypdf"], + n_rows_to_load=None, + index_column=None, + ) + + output_dataframe = component.load().compute() + + assert len(output_dataframe) > 0 + assert output_dataframe.columns.tolist() == ["text", "URL", "Title of this paper"] diff --git a/components/load_with_llamahub/tests/fondant_component.yaml b/components/load_with_llamahub/tests/fondant_component.yaml new file mode 100644 index 000000000..b0f34786f --- /dev/null +++ b/components/load_with_llamahub/tests/fondant_component.yaml @@ -0,0 +1,50 @@ +name: Load with LlamaHub +description: | + Load data using a LlamaHub loader. For available loaders, check the + [LlamaHub](https://llamahub.ai/). +image: ghcr.io/ml6team/load_with_llamahub:dev + +produces: + text: + type: string + URL: + type: string + Title of this paper: + type: string + +args: + loader_class: + description: | + The name of the LlamaIndex loader class to use. Make sure to provide the name and not the + id. The name is passed to `llama_index.download_loader` to download the specified loader. + type: str + loader_kwargs: + description: | + Keyword arguments to pass when instantiating the loader class. Check the documentation of + the loader to check which arguments it accepts. + type: str + load_kwargs: + description: | + Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of + the loader to check which arguments it accepts. + type: str + additional_requirements: + description: | + Some loaders require additional dependencies to be installed. You can specify those here. + Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately + additional requirements for LlamaIndex loaders are not documented well, but if a dependency + is missing, a clear error message will be thrown. + type: list + default: [] + n_rows_to_load: + description: | + Optional argument that defines the number of rows to load. 
+      on a small scale.
+    type: int
+    default: None
+  index_column:
+    description: |
+      Column to set as index in the load component. If not specified, a default globally unique
+      index will be set.
+    type: str
+    default: None
diff --git a/components/load_with_llamahub/tests/pytest.ini b/components/load_with_llamahub/tests/pytest.ini
new file mode 100644
index 000000000..bf6a8a517
--- /dev/null
+++ b/components/load_with_llamahub/tests/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = ../src
\ No newline at end of file
diff --git a/components/load_with_llamahub/tests/requirements.txt b/components/load_with_llamahub/tests/requirements.txt
new file mode 100644
index 000000000..2a929edcc
--- /dev/null
+++ b/components/load_with_llamahub/tests/requirements.txt
@@ -0,0 +1 @@
+pytest==7.4.2

From e258d5d78d1533debb9791758ff9ef7ef262342c Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 11:59:49 +0100
Subject: [PATCH 2/4] Update component as reusable component

---
 components/load_with_llamahub/Dockerfile      |  5 ++
 components/load_with_llamahub/README.md       | 56 +++++++++++++++++++
 .../load_with_llamahub/fondant_component.yaml |  8 +--
 .../load_with_llamahub/requirements.txt       |  1 -
 4 files changed, 65 insertions(+), 5 deletions(-)
 create mode 100644 components/load_with_llamahub/README.md

diff --git a/components/load_with_llamahub/Dockerfile b/components/load_with_llamahub/Dockerfile
index a7851484b..5de6e945f 100644
--- a/components/load_with_llamahub/Dockerfile
+++ b/components/load_with_llamahub/Dockerfile
@@ -9,6 +9,11 @@ RUN apt-get update && \
 COPY requirements.txt /
 RUN pip3 install --no-cache-dir -r requirements.txt
 
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
 # Set the working directory to the component folder
 WORKDIR /component
 COPY src/ src/
diff --git a/components/load_with_llamahub/README.md b/components/load_with_llamahub/README.md
new file mode 100644
index 000000000..5c46c7656
--- /dev/null
+++ b/components/load_with_llamahub/README.md
@@ -0,0 +1,56 @@
+# Load with LlamaHub
+
+### Description
+Load data using a LlamaHub loader. For available loaders, check the
+[LlamaHub](https://llamahub.ai/).
+
+
+### Inputs / outputs
+
+**This component consumes no data.**
+
+**This component produces no data.**
+
+### Arguments
+
+The component takes the following arguments to alter its behavior:
+
+| argument | type | description | default |
+| -------- | ---- | ----------- | ------- |
+| loader_class | str | The name of the LlamaIndex loader class to use. Make sure to provide the name and not the id. The name is passed to `llama_index.download_loader` to download the specified loader. | / |
+| loader_kwargs | str | Keyword arguments to pass when instantiating the loader class. Check the documentation of the loader to see which arguments it accepts. | / |
+| load_kwargs | str | Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of the loader to see which arguments it accepts. | / |
+| additional_requirements | list | Some loaders require additional dependencies to be installed. You can specify those here. Use a format accepted by `pip install`. E.g. "pypdf" or "pypdf==3.17.1". Unfortunately, additional requirements for LlamaIndex loaders are not documented well, but if a dependency is missing, a clear error message will be thrown. | / |
+| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale. | / |
+| index_column | str | Column to set as index in the load component. If not specified, a default globally unique index will be set. | / |
+
+### Usage
+
+You can add this component to your pipeline using the following code:
+
+```python
+from fondant.pipeline import Pipeline
+
+
+pipeline = Pipeline(...)
+
+dataset = pipeline.read(
+    "load_with_llamahub",
+    arguments={
+        # Add arguments
+        # "loader_class": ,
+        # "loader_kwargs": ,
+        # "load_kwargs": ,
+        # "additional_requirements": [],
+        # "n_rows_to_load": 0,
+        # "index_column": ,
+    }
+)
+```
+
+### Testing
+
+You can run the tests using docker with BuildKit. From this directory, run:
+```
+docker build . --target test
+```
diff --git a/components/load_with_llamahub/fondant_component.yaml b/components/load_with_llamahub/fondant_component.yaml
index b8da90ac2..ca16ff794 100644
--- a/components/load_with_llamahub/fondant_component.yaml
+++ b/components/load_with_llamahub/fondant_component.yaml
@@ -2,12 +2,12 @@ name: Load with LlamaHub
 description: |
   Load data using a LlamaHub loader. For available loaders, check the
   [LlamaHub](https://llamahub.ai/).
-image: ghcr.io/ml6team/load_with_llamahub:dev
+image: fndnt/load_with_llamahub:dev
+tags:
+  - Data loading
 
 produces:
-  text:
-    type: string
-  #TODO: Add extra fields to extract from document metadata
+  additionalProperties: true
 
 args:
   loader_class:
diff --git a/components/load_with_llamahub/requirements.txt b/components/load_with_llamahub/requirements.txt
index 8356b3e35..3a7971f8f 100644
--- a/components/load_with_llamahub/requirements.txt
+++ b/components/load_with_llamahub/requirements.txt
@@ -1,2 +1 @@
-fondant[component]==0.8.dev2
 llama-index==0.9.9

From de42e9ee2a4e3ee3f44aed0321c76fe7efd073d5 Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 14:23:20 +0100
Subject: [PATCH 3/4] Limit readme pre-commit to top level component specs

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6712e4330..f53ae280e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -68,5 +68,5 @@ repos:
         name: Generate component READMEs
         language: python
         entry: python scripts/component_readme/generate_readme.py
-        files: ^components/.*/fondant_component.yaml
+        files: ^components/[^/]*/fondant_component.yaml
        additional_dependencies: ["fondant@git+https://github.com/ml6team/fondant@main", "Jinja2==3.1.2"]
\ No newline at end of file

From 1ed821b13fa820965916e6536e6abdebafa3c92a Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 12:13:45 +0100
Subject: [PATCH 4/4] Update hub links in documentation after adding new
 components

---
 .../download_images/fondant_component.yaml |  2 +-
 components/load_from_csv/README.md         |  2 +-
 .../load_from_csv/fondant_component.yaml   |  2 +-
 components/load_from_hf_hub/README.md      |  2 +-
 .../load_from_hf_hub/fondant_component.yaml |  2 +-
 components/write_to_hf_hub/README.md       |  2 +-
 .../write_to_hf_hub/fondant_component.yaml |  2 +-
 docs/components/hub.md                     | 28 +++++++++++++++----
 docs/guides/build_a_simple_pipeline.md     | 25 +++++++++--------
 docs/guides/implement_custom_components.md |  2 +-
 10 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml
index 7a230a527..47b178063 100644
--- a/components/download_images/fondant_component.yaml
+++ b/components/download_images/fondant_component.yaml
@@ -10,7 +10,7 @@ description: |
 image: fndnt/download_images:dev
 
 tags:
-  - Image processing
+  - Data retrieval
 
 consumes:
   image_url:
diff --git a/components/load_from_csv/README.md b/components/load_from_csv/README.md
index b444628f7..ce52647ee 100644
--- a/components/load_from_csv/README.md
+++ b/components/load_from_csv/README.md
@@ -1,4 +1,4 @@
-# Load from csv file
+# Load from csv
 
 ### Description
 Component that loads a dataset from a csv file
diff --git a/components/load_from_csv/fondant_component.yaml b/components/load_from_csv/fondant_component.yaml
index 343706ec4..4c27c6d6a 100644
--- a/components/load_from_csv/fondant_component.yaml
+++ b/components/load_from_csv/fondant_component.yaml
@@ -1,4 +1,4 @@
-name: Load from csv file
+name: Load from csv
 description: Component that loads a dataset from a csv file
 image: fndnt/load_from_csv:dev
 tags:
diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md
index 62b28af83..2fe9a8f6c 100644
--- a/components/load_from_hf_hub/README.md
+++ b/components/load_from_hf_hub/README.md
@@ -1,4 +1,4 @@
-# Load from hub
+# Load from Hugging Face hub
 
 ### Description
 Component that loads a dataset from the hub
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index f0e1bcdb4..19fc612c8 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,4 +1,4 @@
-name: Load from hub
+name: Load from Hugging Face hub
 description: Component that loads a dataset from the hub
 image: fndnt/load_from_hf_hub:dev
 tags:
diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md
index 5197cbd32..05268b782 100644
--- a/components/write_to_hf_hub/README.md
+++ b/components/write_to_hf_hub/README.md
@@ -1,4 +1,4 @@
-# Write to hub
+# Write to Hugging Face hub
 
 ### Description
 Component that writes a dataset to the hub
diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml
index 74169d483..61af7e0ef 100644
--- a/components/write_to_hf_hub/fondant_component.yaml
+++ b/components/write_to_hf_hub/fondant_component.yaml
@@ -1,4 +1,4 @@
-name: Write to hub
+name: Write to Hugging Face hub
 description: Component that writes a dataset to the hub
 image: fndnt/write_to_hf_hub:dev
 tags:
diff --git a/docs/components/hub.md b/docs/components/hub.md
index 40f04ec2e..54fd562cc 100644
--- a/docs/components/hub.md
+++ b/docs/components/hub.md
@@ -8,11 +8,15 @@ Below you can find the reusable components offered by Fondant.
 
 **Data loading**
 
+??? "Load from csv"
+
+    --8<-- "components/load_from_csv/README.md:1"
+
 ??? "Load from files"
 
     --8<-- "components/load_from_files/README.md:1"
 
-??? "Load from hub"
+??? "Load from Hugging Face hub"
 
     --8<-- "components/load_from_hf_hub/README.md:1"
 
@@ -20,8 +24,20 @@ Below you can find the reusable components offered by Fondant.
 
     --8<-- "components/load_from_parquet/README.md:1"
 
+??? "Load with LlamaHub"
+
+    --8<-- "components/load_with_llamahub/README.md:1"
+
 **Data retrieval**
 
+??? "Download images"
+
+    --8<-- "components/download_images/README.md:1"
+
+??? "retrieve_from_weaviate"
+
+    --8<-- "components/retrieve_from_weaviate/README.md:1"
+
 ??? "Embedding based LAION retrieval"
"Embedding based LAION retrieval" --8<-- "components/retrieve_laion_by_embedding/README.md:1" @@ -40,7 +56,7 @@ Below you can find the reusable components offered by Fondant. --8<-- "components/index_weaviate/README.md:1" -??? "Write to hub" +??? "Write to Hugging Face hub" --8<-- "components/write_to_hf_hub/README.md:1" @@ -54,10 +70,6 @@ Below you can find the reusable components offered by Fondant. --8<-- "components/crop_images/README.md:1" -??? "Download images" - - --8<-- "components/download_images/README.md:1" - ??? "Embed images" --8<-- "components/embed_images/README.md:1" @@ -88,6 +100,10 @@ Below you can find the reusable components offered by Fondant. --8<-- "components/embed_text/README.md:1" +??? "retriever_eval_ragas" + + --8<-- "components/evaluate_ragas/README.md:1" + ??? "Filter languages" --8<-- "components/filter_language/README.md:1" diff --git a/docs/guides/build_a_simple_pipeline.md b/docs/guides/build_a_simple_pipeline.md index d424e8c02..2fef3eb7f 100644 --- a/docs/guides/build_a_simple_pipeline.md +++ b/docs/guides/build_a_simple_pipeline.md @@ -8,14 +8,17 @@ the Fondant hub. ## Overview In this guide, we will build a pipeline that downloads images from the -[fondant-cc-25m](https://huggingface.co/datasets/fondant-ai/fondant-cc-25m) dataset. +[fondant-cc-25m](https://huggingface.co/datasets/fondant-ai/fondant-cc-25m) dataset and filters +them. -It consists of two steps: +It consists of three steps: -* **[load_from_hf_hub](https://github.com/ml6team/fondant/tree/main/components/load_from_hf_hub)**: +* **[load_from_hf_hub](../components/hub.md#description_2)**: Loads the dataset containing image urls from the Huggingface hub. -* **[download_images](https://github.com/ml6team/fondant/tree/main/components/download_images)**: +* **[download_images](../components/hub.md#description_5)**: Downloads images from the image urls. +* **[filter_language](../components/hub.md#description_22)**: + Filters the images based on the alt text language ## Setting up the environment @@ -64,7 +67,7 @@ If you want to learn more about components, you can check out the As a first step, we want to read data into our pipeline. In this case, we will load a dataset from the HuggingFace Hub. For this, we can use the reusable -[load_from_hf_hub](../components/hub.md#description_1) component. +[load_from_hf_hub](../components/hub.md#description_2) component. We can read data into our pipeline using the `Pipeline.read()` method, which returns a (lazy) `Dataset`. @@ -92,10 +95,10 @@ We provide three arguments to the `.read()` method: - The name of the reusable component - Some arguments to configure the component. Check the component's - [documentation](../components/hub.md#arguments_1) for the supported arguments + [documentation](../components/hub.md#arguments_2) for the supported arguments - The schema of the data the component will produce. This is necessary for this specific component since the output is dynamic based on the dataset being loaded. You can see this - defined in the component [documentation](../components/hub.md#inputs-outputs_1) with + defined in the component [documentation](../components/hub.md#inputs-outputs_2) with `additionalProperties: true` under the produces section. ??? "View a detailed reference of the `Pipeline.read()` method" @@ -128,7 +131,7 @@ directly, we must download each of them. Downloading images is a common requirement across various use cases, which is why Fondant provides a reusable component specifically for this purpose. 
This component is appropriately named -[download_images](../components/hub.md#description_10). +[download_images](../components/hub.md#description_5). We can add this component to our pipeline as follows: @@ -138,7 +141,7 @@ images = dataset.apply( ) ``` -Looking at the component [documentation](../components/hub.md#inputs-outputs_1), we can see that +Looking at the component [documentation](../components/hub.md#inputs-outputs_5), we can see that it expects an `"image_url"` field, which was generated by our previous component. This means that we can simply chain the components as-is. @@ -146,8 +149,8 @@ that we can simply chain the components as-is. This won't always be the case though. We now want to filter our dataset for images that contain English alt text. For this, we leverage the -[filter_language](../components/hub.md#description_18) component. Looking at the component -[documentation](../components/hub.md#inputs-outputs_18), we can see that it expects an `"text"` +[filter_language](../components/hub.md#description_22) component. Looking at the component +[documentation](../components/hub.md#inputs-outputs_22), we can see that it expects an `"text"` field, while we would like to apply it to the `"alt_text"` field in our dataset. We can easily achieve this using the `consumes` argument, which lets us maps the fields that the diff --git a/docs/guides/implement_custom_components.md b/docs/guides/implement_custom_components.md index 28346f656..e7a7fb869 100644 --- a/docs/guides/implement_custom_components.md +++ b/docs/guides/implement_custom_components.md @@ -245,5 +245,5 @@ We now have a pipeline that downloads a dataset from the HuggingFace hub, filter image type, downloads the images, and filters them by alt text language. One final step still remaining, is to write teh final dataset to its destination. You could for -instance use the [`write_to_hf_hub`](../components/hub.md#description_7) component to write it to +instance use the [`write_to_hf_hub`](../components/hub.md#description_11) component to write it to the HuggingFace Hub, or create a custom `WriteComponent`.