From e564a341e8f437f3fb1e7e86c8904d63cc6bde45 Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 11:53:00 +0100
Subject: [PATCH 1/4] Add LlamaHub load component from use case repo

---
 components/load_with_llamahub/Dockerfile      |  24 ++++
 .../load_with_llamahub/fondant_component.yaml |  47 ++++++++
 .../load_with_llamahub/requirements.txt       |   2 +
 components/load_with_llamahub/src/main.py     | 110 ++++++++++++++++++
 .../tests/component_test.py                   |  35 ++++++
 .../tests/fondant_component.yaml              |  50 ++++++++
 .../load_with_llamahub/tests/pytest.ini       |   2 +
 .../load_with_llamahub/tests/requirements.txt |   1 +
 8 files changed, 271 insertions(+)
 create mode 100644 components/load_with_llamahub/Dockerfile
 create mode 100644 components/load_with_llamahub/fondant_component.yaml
 create mode 100644 components/load_with_llamahub/requirements.txt
 create mode 100644 components/load_with_llamahub/src/main.py
 create mode 100644 components/load_with_llamahub/tests/component_test.py
 create mode 100644 components/load_with_llamahub/tests/fondant_component.yaml
 create mode 100644 components/load_with_llamahub/tests/pytest.ini
 create mode 100644 components/load_with_llamahub/tests/requirements.txt

diff --git a/components/load_with_llamahub/Dockerfile b/components/load_with_llamahub/Dockerfile
new file mode 100644
index 000000000..a7851484b
--- /dev/null
+++ b/components/load_with_llamahub/Dockerfile
@@ -0,0 +1,24 @@
+FROM --platform=linux/amd64 python:3.8-slim as base
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component
+COPY src/ src/
+
+FROM base as test
+COPY tests/ tests/
+RUN pip3 install --no-cache-dir -r tests/requirements.txt
+RUN python -m pytest tests
+
+FROM base
+WORKDIR /component/src
+ENTRYPOINT ["fondant", "execute", "main"]
+
diff --git a/components/load_with_llamahub/fondant_component.yaml b/components/load_with_llamahub/fondant_component.yaml
new file mode 100644
index 000000000..b8da90ac2
--- /dev/null
+++ b/components/load_with_llamahub/fondant_component.yaml
@@ -0,0 +1,47 @@
+name: Load with LlamaHub
+description: |
+  Load data using a LlamaHub loader. For available loaders, check the
+  [LlamaHub](https://llamahub.ai/).
+image: ghcr.io/ml6team/load_with_llamahub:dev
+
+produces:
+  text:
+    type: string
+  #TODO: Add extra fields to extract from document metadata
+
+args:
+  loader_class:
+    description: |
+      The name of the LlamaIndex loader class to use. Make sure to provide the name and not the
+      id. The name is passed to `llama_index.download_loader` to download the specified loader.
+    type: str
+  loader_kwargs:
+    description: |
+      Keyword arguments to pass when instantiating the loader class. Check the documentation of
+      the loader to see which arguments it accepts.
+    type: str
+  load_kwargs:
+    description: |
+      Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of
+      the loader to see which arguments it accepts.
+    type: str
+  additional_requirements:
+    description: |
+      Some loaders require additional dependencies to be installed. You can specify those here.
+      Use a format accepted by `pip install`. E.g. "pypdf" or "pypdf==3.17.1". Unfortunately,
+      additional requirements for LlamaIndex loaders are not documented well, but if a dependency
+      is missing, a clear error message will be thrown.
+    type: list
+    default: []
+  n_rows_to_load:
+    description: |
+      Optional argument that defines the number of rows to load. Useful for testing pipeline runs
+      on a small scale.
+    type: int
+    default: None
+  index_column:
+    description: |
+      Column to set as index in the load component. If not specified, a default globally unique
+      index will be set.
+    type: str
+    default: None
diff --git a/components/load_with_llamahub/requirements.txt b/components/load_with_llamahub/requirements.txt
new file mode 100644
index 000000000..8356b3e35
--- /dev/null
+++ b/components/load_with_llamahub/requirements.txt
@@ -0,0 +1,2 @@
+fondant[component]==0.8.dev2
+llama-index==0.9.9
diff --git a/components/load_with_llamahub/src/main.py b/components/load_with_llamahub/src/main.py
new file mode 100644
index 000000000..4089f4f76
--- /dev/null
+++ b/components/load_with_llamahub/src/main.py
@@ -0,0 +1,110 @@
+import logging
+import subprocess
+import sys
+import typing as t
+from collections import defaultdict
+
+import dask.dataframe as dd
+import pandas as pd
+from fondant.component import DaskLoadComponent
+from fondant.core.component_spec import ComponentSpec
+from llama_index import download_loader
+
+logger = logging.getLogger(__name__)
+
+
+class LlamaHubReader(DaskLoadComponent):
+    def __init__(
+        self,
+        spec: ComponentSpec,
+        *,
+        loader_class: str,
+        loader_kwargs: dict,
+        load_kwargs: dict,
+        additional_requirements: t.List[str],
+        n_rows_to_load: t.Optional[int] = None,
+        index_column: t.Optional[str] = None,
+    ) -> None:
+        """
+        Args:
+            spec: the component spec
+            loader_class: The name of the LlamaIndex loader class to use
+            loader_kwargs: Keyword arguments to pass when instantiating the loader class
+            load_kwargs: Keyword arguments to pass to the `.load()` method of the loader
+            additional_requirements: Additional Python requirements to install
+            n_rows_to_load: Optional argument that defines the number of rows to load.
+                Useful for testing pipeline runs on a small scale.
+            index_column: Column to set as index in the load component. If not specified, a
+                default globally unique index will be set.
+ """ + self.n_rows_to_load = n_rows_to_load + self.index_column = index_column + self.spec = spec + + self.install_additional_requirements(additional_requirements) + + loader_cls = download_loader(loader_class) + self.loader = loader_cls(**loader_kwargs) + self.load_kwargs = load_kwargs + + @staticmethod + def install_additional_requirements(additional_requirements: t.List[str]): + for requirement in additional_requirements: + subprocess.check_call( # nosec + [sys.executable, "-m", "pip", "install", requirement], + ) + + def set_df_index(self, dask_df: dd.DataFrame) -> dd.DataFrame: + if self.index_column is None: + logger.info( + "Index column not specified, setting a globally unique index", + ) + + def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): + """Function that sets a unique index based on the partition and row number.""" + dataframe["id"] = 1 + dataframe["id"] = ( + str(partition_info["number"]) + + "_" + + (dataframe.id.cumsum()).astype(str) + ) + dataframe.index = dataframe.pop("id") + return dataframe + + def _get_meta_df() -> pd.DataFrame: + meta_dict = {"id": pd.Series(dtype="object")} + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) + return pd.DataFrame(meta_dict).set_index("id") + + meta = _get_meta_df() + dask_df = dask_df.map_partitions(_set_unique_index, meta=meta) + else: + logger.info(f"Setting `{self.index_column}` as index") + dask_df = dask_df.set_index(self.index_column, drop=True) + + return dask_df + + def load(self) -> dd.DataFrame: + try: + documents = self.loader.lazy_load_data(**self.load_kwargs) + except NotImplementedError: + documents = self.loader.load_data(**self.load_kwargs) + + doc_dict = defaultdict(list) + for d, document in enumerate(documents): + for column in self.spec.produces: + if column == "text": + doc_dict["text"].append(document.text) + else: + doc_dict[column].append(document.metadata.get(column)) + + if d == self.n_rows_to_load: + break + + dask_df = dd.from_dict(doc_dict, npartitions=1) + + dask_df = self.set_df_index(dask_df) + return dask_df diff --git a/components/load_with_llamahub/tests/component_test.py b/components/load_with_llamahub/tests/component_test.py new file mode 100644 index 000000000..217b42281 --- /dev/null +++ b/components/load_with_llamahub/tests/component_test.py @@ -0,0 +1,35 @@ +from pathlib import Path + +import yaml +from fondant.core.component_spec import ComponentSpec + +from src.main import LlamaHubReader + + +def test_arxiv_reader(): + """Test the component with the ArxivReader. + + This test requires a stable internet connection, both to download the loader, and to download + the papers from Arxiv. 
+ """ + with open(Path(__file__).with_name("fondant_component.yaml")) as f: + spec = yaml.safe_load(f) + spec = ComponentSpec(spec) + + component = LlamaHubReader( + spec=spec, + loader_class="ArxivReader", + loader_kwargs={}, + load_kwargs={ + "search_query": "jeff dean", + "max_results": 5, + }, + additional_requirements=["pypdf"], + n_rows_to_load=None, + index_column=None, + ) + + output_dataframe = component.load().compute() + + assert len(output_dataframe) > 0 + assert output_dataframe.columns.tolist() == ["text", "URL", "Title of this paper"] diff --git a/components/load_with_llamahub/tests/fondant_component.yaml b/components/load_with_llamahub/tests/fondant_component.yaml new file mode 100644 index 000000000..b0f34786f --- /dev/null +++ b/components/load_with_llamahub/tests/fondant_component.yaml @@ -0,0 +1,50 @@ +name: Load with LlamaHub +description: | + Load data using a LlamaHub loader. For available loaders, check the + [LlamaHub](https://llamahub.ai/). +image: ghcr.io/ml6team/load_with_llamahub:dev + +produces: + text: + type: string + URL: + type: string + Title of this paper: + type: string + +args: + loader_class: + description: | + The name of the LlamaIndex loader class to use. Make sure to provide the name and not the + id. The name is passed to `llama_index.download_loader` to download the specified loader. + type: str + loader_kwargs: + description: | + Keyword arguments to pass when instantiating the loader class. Check the documentation of + the loader to check which arguments it accepts. + type: str + load_kwargs: + description: | + Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of + the loader to check which arguments it accepts. + type: str + additional_requirements: + description: | + Some loaders require additional dependencies to be installed. You can specify those here. + Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately + additional requirements for LlamaIndex loaders are not documented well, but if a dependency + is missing, a clear error message will be thrown. + type: list + default: [] + n_rows_to_load: + description: | + Optional argument that defines the number of rows to load. 
+      on a small scale.
+    type: int
+    default: None
+  index_column:
+    description: |
+      Column to set as index in the load component. If not specified, a default globally unique
+      index will be set.
+    type: str
+    default: None
diff --git a/components/load_with_llamahub/tests/pytest.ini b/components/load_with_llamahub/tests/pytest.ini
new file mode 100644
index 000000000..bf6a8a517
--- /dev/null
+++ b/components/load_with_llamahub/tests/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = ../src
\ No newline at end of file
diff --git a/components/load_with_llamahub/tests/requirements.txt b/components/load_with_llamahub/tests/requirements.txt
new file mode 100644
index 000000000..2a929edcc
--- /dev/null
+++ b/components/load_with_llamahub/tests/requirements.txt
@@ -0,0 +1 @@
+pytest==7.4.2

From e258d5d78d1533debb9791758ff9ef7ef262342c Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 11:59:49 +0100
Subject: [PATCH 2/4] Update component as reusable component

---
 components/load_with_llamahub/Dockerfile      |  5 ++
 components/load_with_llamahub/README.md       | 56 +++++++++++++++++++
 .../load_with_llamahub/fondant_component.yaml |  8 +--
 .../load_with_llamahub/requirements.txt       |  1 -
 4 files changed, 65 insertions(+), 5 deletions(-)
 create mode 100644 components/load_with_llamahub/README.md

diff --git a/components/load_with_llamahub/Dockerfile b/components/load_with_llamahub/Dockerfile
index a7851484b..5de6e945f 100644
--- a/components/load_with_llamahub/Dockerfile
+++ b/components/load_with_llamahub/Dockerfile
@@ -9,6 +9,11 @@ RUN apt-get update && \
 COPY requirements.txt /
 RUN pip3 install --no-cache-dir -r requirements.txt
 
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
 # Set the working directory to the component folder
 WORKDIR /component
 COPY src/ src/
diff --git a/components/load_with_llamahub/README.md b/components/load_with_llamahub/README.md
new file mode 100644
index 000000000..5c46c7656
--- /dev/null
+++ b/components/load_with_llamahub/README.md
@@ -0,0 +1,56 @@
+# Load with LlamaHub
+
+### Description
+Load data using a LlamaHub loader. For available loaders, check the
+[LlamaHub](https://llamahub.ai/).
+
+
+### Inputs / outputs
+
+**This component consumes no data.**
+
+**This component produces no data.**
+
+### Arguments
+
+The component takes the following arguments to alter its behavior:
+
+| argument | type | description | default |
+| -------- | ---- | ----------- | ------- |
+| loader_class | str | The name of the LlamaIndex loader class to use. Make sure to provide the name and not the id. The name is passed to `llama_index.download_loader` to download the specified loader. | / |
+| loader_kwargs | str | Keyword arguments to pass when instantiating the loader class. Check the documentation of the loader to see which arguments it accepts. | / |
+| load_kwargs | str | Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of the loader to see which arguments it accepts. | / |
+| additional_requirements | list | Some loaders require additional dependencies to be installed. You can specify those here. Use a format accepted by `pip install`. E.g. "pypdf" or "pypdf==3.17.1". Unfortunately, additional requirements for LlamaIndex loaders are not documented well, but if a dependency is missing, a clear error message will be thrown. | / |
+| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale. | / |
+| index_column | str | Column to set as index in the load component. If not specified, a default globally unique index will be set. | / |
+
+### Usage
+
+You can add this component to your pipeline using the following code:
+
+```python
+from fondant.pipeline import Pipeline
+
+
+pipeline = Pipeline(...)
+
+dataset = pipeline.read(
+    "load_with_llamahub",
+    arguments={
+        # Add arguments
+        # "loader_class": ,
+        # "loader_kwargs": ,
+        # "load_kwargs": ,
+        # "additional_requirements": [],
+        # "n_rows_to_load": 0,
+        # "index_column": ,
+    }
+)
+```
+
+### Testing
+
+You can run the tests using docker with BuildKit. From this directory, run:
+```
+docker build . --target test
+```
diff --git a/components/load_with_llamahub/fondant_component.yaml b/components/load_with_llamahub/fondant_component.yaml
index b8da90ac2..ca16ff794 100644
--- a/components/load_with_llamahub/fondant_component.yaml
+++ b/components/load_with_llamahub/fondant_component.yaml
@@ -2,12 +2,12 @@ name: Load with LlamaHub
 description: |
   Load data using a LlamaHub loader. For available loaders, check the
   [LlamaHub](https://llamahub.ai/).
-image: ghcr.io/ml6team/load_with_llamahub:dev
+image: fndnt/load_with_llamahub:dev
+tags:
+  - Data loading
 
 produces:
-  text:
-    type: string
-  #TODO: Add extra fields to extract from document metadata
+  additionalProperties: true
 
 args:
   loader_class:
diff --git a/components/load_with_llamahub/requirements.txt b/components/load_with_llamahub/requirements.txt
index 8356b3e35..3a7971f8f 100644
--- a/components/load_with_llamahub/requirements.txt
+++ b/components/load_with_llamahub/requirements.txt
@@ -1,2 +1 @@
-fondant[component]==0.8.dev2
 llama-index==0.9.9

From de42e9ee2a4e3ee3f44aed0321c76fe7efd073d5 Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 14:23:20 +0100
Subject: [PATCH 3/4] Limit readme pre-commit to top level component specs

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6712e4330..f53ae280e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -68,5 +68,5 @@ repos:
         name: Generate component READMEs
         language: python
         entry: python scripts/component_readme/generate_readme.py
-        files: ^components/.*/fondant_component.yaml
+        files: ^components/[^/]*/fondant_component.yaml
        additional_dependencies: ["fondant@git+https://github.com/ml6team/fondant@main", "Jinja2==3.1.2"]
\ No newline at end of file

From 1ed821b13fa820965916e6536e6abdebafa3c92a Mon Sep 17 00:00:00 2001
From: Robbe Sneyders
Date: Tue, 12 Dec 2023 12:13:45 +0100
Subject: [PATCH 4/4] Update hub links in documentation after adding new
 components

---
 .../download_images/fondant_component.yaml |  2 +-
 components/load_from_csv/README.md         |  2 +-
 .../load_from_csv/fondant_component.yaml   |  2 +-
 components/load_from_hf_hub/README.md      |  2 +-
 .../load_from_hf_hub/fondant_component.yaml |  2 +-
 components/write_to_hf_hub/README.md       |  2 +-
 .../write_to_hf_hub/fondant_component.yaml |  2 +-
 docs/components/hub.md                     | 28 +++++++++++++++----
 docs/guides/build_a_simple_pipeline.md     | 25 +++++++++--------
 docs/guides/implement_custom_components.md |  2 +-
 10 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml
index 7a230a527..47b178063 100644
--- a/components/download_images/fondant_component.yaml
+++ b/components/download_images/fondant_component.yaml
@@ -10,7 +10,7 @@ description: |
 image: fndnt/download_images:dev
 
 tags:
-  - Image processing
+  - Data retrieval
 
 consumes:
   image_url:
diff --git a/components/load_from_csv/README.md b/components/load_from_csv/README.md
index b444628f7..ce52647ee 100644
--- a/components/load_from_csv/README.md
+++ b/components/load_from_csv/README.md
@@ -1,4 +1,4 @@
-# Load from csv file
+# Load from csv
 
 ### Description
 Component that loads a dataset from a csv file
diff --git a/components/load_from_csv/fondant_component.yaml b/components/load_from_csv/fondant_component.yaml
index 343706ec4..4c27c6d6a 100644
--- a/components/load_from_csv/fondant_component.yaml
+++ b/components/load_from_csv/fondant_component.yaml
@@ -1,4 +1,4 @@
-name: Load from csv file
+name: Load from csv
 description: Component that loads a dataset from a csv file
 image: fndnt/load_from_csv:dev
 tags:
diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md
index 62b28af83..2fe9a8f6c 100644
--- a/components/load_from_hf_hub/README.md
+++ b/components/load_from_hf_hub/README.md
@@ -1,4 +1,4 @@
-# Load from hub
+# Load from Hugging Face hub
 
 ### Description
 Component that loads a dataset from the hub
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index f0e1bcdb4..19fc612c8 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,4 +1,4 @@
-name: Load from hub
+name: Load from Hugging Face hub
 description: Component that loads a dataset from the hub
 image: fndnt/load_from_hf_hub:dev
 tags:
diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md
index 5197cbd32..05268b782 100644
--- a/components/write_to_hf_hub/README.md
+++ b/components/write_to_hf_hub/README.md
@@ -1,4 +1,4 @@
-# Write to hub
+# Write to Hugging Face hub
 
 ### Description
 Component that writes a dataset to the hub
diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml
index 74169d483..61af7e0ef 100644
--- a/components/write_to_hf_hub/fondant_component.yaml
+++ b/components/write_to_hf_hub/fondant_component.yaml
@@ -1,4 +1,4 @@
-name: Write to hub
+name: Write to Hugging Face hub
 description: Component that writes a dataset to the hub
 image: fndnt/write_to_hf_hub:dev
 tags:
diff --git a/docs/components/hub.md b/docs/components/hub.md
index 40f04ec2e..54fd562cc 100644
--- a/docs/components/hub.md
+++ b/docs/components/hub.md
@@ -8,11 +8,15 @@ Below you can find the reusable components offered by Fondant.
 
 **Data loading**
 
+??? "Load from csv"
+
+    --8<-- "components/load_from_csv/README.md:1"
+
 ??? "Load from files"
 
     --8<-- "components/load_from_files/README.md:1"
 
-??? "Load from hub"
+??? "Load from Hugging Face hub"
 
     --8<-- "components/load_from_hf_hub/README.md:1"
 
@@ -20,8 +24,20 @@ Below you can find the reusable components offered by Fondant.
 
     --8<-- "components/load_from_parquet/README.md:1"
 
+??? "Load with LlamaHub"
+
+    --8<-- "components/load_with_llamahub/README.md:1"
+
 **Data retrieval**
 
+??? "Download images"
+
+    --8<-- "components/download_images/README.md:1"
+
+??? "retrieve_from_weaviate"
+
+    --8<-- "components/retrieve_from_weaviate/README.md:1"
+
 ??? "Embedding based LAION retrieval"
"Embedding based LAION retrieval" --8<-- "components/retrieve_laion_by_embedding/README.md:1" @@ -40,7 +56,7 @@ Below you can find the reusable components offered by Fondant. --8<-- "components/index_weaviate/README.md:1" -??? "Write to hub" +??? "Write to Hugging Face hub" --8<-- "components/write_to_hf_hub/README.md:1" @@ -54,10 +70,6 @@ Below you can find the reusable components offered by Fondant. --8<-- "components/crop_images/README.md:1" -??? "Download images" - - --8<-- "components/download_images/README.md:1" - ??? "Embed images" --8<-- "components/embed_images/README.md:1" @@ -88,6 +100,10 @@ Below you can find the reusable components offered by Fondant. --8<-- "components/embed_text/README.md:1" +??? "retriever_eval_ragas" + + --8<-- "components/evaluate_ragas/README.md:1" + ??? "Filter languages" --8<-- "components/filter_language/README.md:1" diff --git a/docs/guides/build_a_simple_pipeline.md b/docs/guides/build_a_simple_pipeline.md index d424e8c02..2fef3eb7f 100644 --- a/docs/guides/build_a_simple_pipeline.md +++ b/docs/guides/build_a_simple_pipeline.md @@ -8,14 +8,17 @@ the Fondant hub. ## Overview In this guide, we will build a pipeline that downloads images from the -[fondant-cc-25m](https://huggingface.co/datasets/fondant-ai/fondant-cc-25m) dataset. +[fondant-cc-25m](https://huggingface.co/datasets/fondant-ai/fondant-cc-25m) dataset and filters +them. -It consists of two steps: +It consists of three steps: -* **[load_from_hf_hub](https://github.com/ml6team/fondant/tree/main/components/load_from_hf_hub)**: +* **[load_from_hf_hub](../components/hub.md#description_2)**: Loads the dataset containing image urls from the Huggingface hub. -* **[download_images](https://github.com/ml6team/fondant/tree/main/components/download_images)**: +* **[download_images](../components/hub.md#description_5)**: Downloads images from the image urls. +* **[filter_language](../components/hub.md#description_22)**: + Filters the images based on the alt text language ## Setting up the environment @@ -64,7 +67,7 @@ If you want to learn more about components, you can check out the As a first step, we want to read data into our pipeline. In this case, we will load a dataset from the HuggingFace Hub. For this, we can use the reusable -[load_from_hf_hub](../components/hub.md#description_1) component. +[load_from_hf_hub](../components/hub.md#description_2) component. We can read data into our pipeline using the `Pipeline.read()` method, which returns a (lazy) `Dataset`. @@ -92,10 +95,10 @@ We provide three arguments to the `.read()` method: - The name of the reusable component - Some arguments to configure the component. Check the component's - [documentation](../components/hub.md#arguments_1) for the supported arguments + [documentation](../components/hub.md#arguments_2) for the supported arguments - The schema of the data the component will produce. This is necessary for this specific component since the output is dynamic based on the dataset being loaded. You can see this - defined in the component [documentation](../components/hub.md#inputs-outputs_1) with + defined in the component [documentation](../components/hub.md#inputs-outputs_2) with `additionalProperties: true` under the produces section. ??? "View a detailed reference of the `Pipeline.read()` method" @@ -128,7 +131,7 @@ directly, we must download each of them. Downloading images is a common requirement across various use cases, which is why Fondant provides a reusable component specifically for this purpose. 
This component is appropriately named -[download_images](../components/hub.md#description_10). +[download_images](../components/hub.md#description_5). We can add this component to our pipeline as follows: @@ -138,7 +141,7 @@ images = dataset.apply( ) ``` -Looking at the component [documentation](../components/hub.md#inputs-outputs_1), we can see that +Looking at the component [documentation](../components/hub.md#inputs-outputs_5), we can see that it expects an `"image_url"` field, which was generated by our previous component. This means that we can simply chain the components as-is. @@ -146,8 +149,8 @@ that we can simply chain the components as-is. This won't always be the case though. We now want to filter our dataset for images that contain English alt text. For this, we leverage the -[filter_language](../components/hub.md#description_18) component. Looking at the component -[documentation](../components/hub.md#inputs-outputs_18), we can see that it expects an `"text"` +[filter_language](../components/hub.md#description_22) component. Looking at the component +[documentation](../components/hub.md#inputs-outputs_22), we can see that it expects an `"text"` field, while we would like to apply it to the `"alt_text"` field in our dataset. We can easily achieve this using the `consumes` argument, which lets us maps the fields that the diff --git a/docs/guides/implement_custom_components.md b/docs/guides/implement_custom_components.md index 28346f656..e7a7fb869 100644 --- a/docs/guides/implement_custom_components.md +++ b/docs/guides/implement_custom_components.md @@ -245,5 +245,5 @@ We now have a pipeline that downloads a dataset from the HuggingFace hub, filter image type, downloads the images, and filters them by alt text language. One final step still remaining, is to write teh final dataset to its destination. You could for -instance use the [`write_to_hf_hub`](../components/hub.md#description_7) component to write it to +instance use the [`write_to_hf_hub`](../components/hub.md#description_11) component to write it to the HuggingFace Hub, or create a custom `WriteComponent`.