Use cleaner field names in reusable components (#679)
This PR cleans up the field names in the reusable components. The old names (e.g. `images_data`, `captions_text`) were just concatenations of the former subset and field names, left over from the initial migration away from subsets.

I ran the tests for all components, fixed the outdated ones, and standardized the test directory structure. Each `tests` directory now has a `pytest.ini` so the `PYTHONPATH` is set correctly both inside and outside of Docker, `test_requirements.txt` was moved into the `tests` directory as `requirements.txt`, and the `Dockerfile` was updated accordingly.
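As a rough sketch of the renaming pattern (using the `caption_images` field names shown in the diff below; other components follow the same scheme), a `fondant_component.yaml` changes along these lines:

```yaml
# Before: field names concatenated from the old subset and field names
consumes:
  images_data:
    type: binary
produces:
  captions_text:
    type: utf8
---
# After: cleaner, standalone field names
consumes:
  image:
    type: binary
produces:
  caption:
    type: utf8
```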
RobbeSneyders authored Nov 28, 2023
1 parent 6a84677 commit 197ac59
Showing 122 changed files with 305 additions and 298 deletions.
4 changes: 1 addition & 3 deletions components/caption_images/Dockerfile
@@ -17,12 +17,10 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team
# Set the working directory to the component folder
WORKDIR /component
COPY src/ src/
ENV PYTHONPATH "${PYTHONPATH}:./src"

FROM base as test
COPY test_requirements.txt .
RUN pip3 install --no-cache-dir -r test_requirements.txt
COPY tests/ tests/
RUN pip3 install --no-cache-dir -r tests/requirements.txt
RUN python -m pytest tests

FROM base
4 changes: 2 additions & 2 deletions components/caption_images/README.md
@@ -7,11 +7,11 @@ This component captions images using a BLIP model from the Hugging Face hub

**This component consumes:**

- images_data: binary
- image: binary

**This component produces:**

- captions_text: string
- caption: string

### Arguments

4 changes: 2 additions & 2 deletions components/caption_images/fondant_component.yaml
@@ -5,11 +5,11 @@ tags:
- Image processing

consumes:
images_data:
image:
type: binary

produces:
captions_text:
caption:
type: utf8

args:
4 changes: 2 additions & 2 deletions components/caption_images/src/main.py
@@ -90,7 +90,7 @@ def __init__(
self.max_new_tokens = max_new_tokens

def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
images = dataframe["images_data"]
images = dataframe["image"]

results: t.List[pd.Series] = []
for batch in np.split(
@@ -112,4 +112,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
).T
results.append(captions)

return pd.concat(results).to_frame(name=("captions_text"))
return pd.concat(results).to_frame(name="caption")
File renamed without changes.
4 changes: 2 additions & 2 deletions components/caption_images/tests/test_caption_images.py
@@ -10,11 +10,11 @@ def test_image_caption_component():
"https://cdn.pixabay.com/photo/2023/07/19/18/56/japanese-beetle-8137606_1280.png",
]
input_dataframe = pd.DataFrame(
{"images": {"data": [requests.get(url).content for url in image_urls]}},
{"image": [requests.get(url).content for url in image_urls]},
)

expected_output_dataframe = pd.DataFrame(
data={("captions", "text"): {0: "a motorcycle", 1: "a beetle"}},
data={"caption": {0: "a motorcycle", 1: "a beetle"}},
)

component = CaptionImagesComponent(
6 changes: 2 additions & 4 deletions components/chunk_text/Dockerfile
@@ -17,14 +17,12 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team
# Set the working directory to the component folder
WORKDIR /component
COPY src/ src/
ENV PYTHONPATH "${PYTHONPATH}:./src"

FROM base as test
COPY test_requirements.txt .
RUN pip3 install --no-cache-dir -r test_requirements.txt
COPY tests/ tests/
RUN pip3 install --no-cache-dir -r tests/requirements.txt
RUN python -m pytest tests

FROM base
WORKDIR /component/src
ENTRYPOINT ["fondant", "execute", "main"]
ENTRYPOINT ["fondant", "execute", "main"]
6 changes: 3 additions & 3 deletions components/chunk_text/README.md
@@ -11,12 +11,12 @@ consists of the id of the original document followed by the chunk index.

**This component consumes:**

- text_data: string
- text: string

**This component produces:**

- text_data: string
- text_original_document_id: string
- text: string
- original_document_id: string

### Arguments

6 changes: 3 additions & 3 deletions components/chunk_text/fondant_component.yaml
@@ -10,13 +10,13 @@ tags:
- Text processing

consumes:
text_data:
text:
type: string

produces:
text_data:
text:
type: string
text_original_document_id:
original_document_id:
type: string

args:
4 changes: 2 additions & 2 deletions components/chunk_text/src/main.py
@@ -38,7 +38,7 @@ def __init__(
def chunk_text(self, row) -> t.List[t.Tuple]:
# Multi-index df has id under the name attribute
doc_id = row.name
text_data = row[("text_data")]
text_data = row["text"]
docs = self.text_splitter.create_documents([text_data])
return [
(doc_id, f"{doc_id}_{chunk_id}", chunk.page_content)
@@ -59,7 +59,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
# Turn into dataframes
results_df = pd.DataFrame(
results,
columns=["text_original_document_id", "id", "text_data"],
columns=["original_document_id", "id", "text"],
)
results_df = results_df.set_index("id")

6 changes: 3 additions & 3 deletions components/chunk_text/tests/chunk_text_test.py
@@ -7,7 +7,7 @@ def test_transform():
"""Test chunk component method."""
input_dataframe = pd.DataFrame(
{
("text_data"): [
"text": [
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo",
"ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis",
"parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec,",
@@ -25,8 +25,8 @@ def test_transform():

expected_output_dataframe = pd.DataFrame(
{
("text_original_document_id"): ["a", "a", "a", "b", "b", "c", "c"],
("text_data"): [
"original_document_id": ["a", "a", "a", "b", "b", "c", "c"],
"text": [
"Lorem ipsum dolor sit amet, consectetuer",
"amet, consectetuer adipiscing elit. Aenean",
"elit. Aenean commodo",
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -26,9 +26,9 @@ right side is border-cropped image.

**This component produces:**

- images_data: binary
- images_width: int32
- images_height: int32
- image: binary
- image_width: int32
- image_height: int32

### Arguments

@@ -47,14 +47,14 @@ You can add this component to your pipeline using the following code:
from fondant.pipeline import ComponentOp


image_cropping_op = ComponentOp.from_registry(
name="image_cropping",
crop_images_op = ComponentOp.from_registry(
name="crop_images",
arguments={
# Add arguments
# "cropping_threshold": -30,
# "padding": 10,
}
)
pipeline.add_op(image_cropping_op, dependencies=[...]) #Add previous component as dependency
pipeline.add_op(crop_images_op, dependencies=[...]) #Add previous component as dependency
```

@@ -24,11 +24,11 @@ consumes:
type: binary

produces:
images_data:
image:
type: binary
images_width:
image_width:
type: int32
images_height:
image_height:
type: int32

args:
File renamed without changes.
File renamed without changes.
@@ -46,12 +46,12 @@ def __init__(

def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
# crop images
dataframe["images_data"] = dataframe["images_data"].apply(
dataframe["image"] = dataframe["image"].apply(
lambda image: remove_borders(image, self.cropping_threshold, self.padding),
)

# extract width and height
dataframe["images_width", "images_height"] = dataframe["images_data"].apply(
dataframe["image_width", "image_height"] = dataframe["image"].apply(
extract_dimensions,
axis=1,
result_type="expand",
6 changes: 2 additions & 4 deletions components/download_images/Dockerfile
@@ -17,14 +17,12 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team
# Set the working directory to the component folder
WORKDIR /component
COPY src/ src/
ENV PYTHONPATH "${PYTHONPATH}:./src"

FROM base as test
COPY test_requirements.txt .
RUN pip3 install --no-cache-dir -r test_requirements.txt
COPY tests/ tests/
RUN pip3 install --no-cache-dir -r tests/requirements.txt
RUN python -m pytest tests

FROM base
WORKDIR /component/src
ENTRYPOINT ["fondant", "execute", "main"]
ENTRYPOINT ["fondant", "execute", "main"]
8 changes: 4 additions & 4 deletions components/download_images/README.md
@@ -14,13 +14,13 @@ from the img2dataset library.

**This component consumes:**

- images_url: string
- image_url: string

**This component produces:**

- images_data: binary
- images_width: int32
- images_height: int32
- image: binary
- image_width: int32
- image_height: int32

### Arguments

8 changes: 4 additions & 4 deletions components/download_images/fondant_component.yaml
@@ -13,15 +13,15 @@ tags:
- Image processing

consumes:
images_url:
image_url:
type: string

produces:
images_data:
image:
type: binary
images_width:
image_width:
type: int32
images_height:
image_height:
type: int32

args:
4 changes: 2 additions & 2 deletions components/download_images/src/main.py
@@ -119,14 +119,14 @@ async def download_dataframe() -> None:
images = await asyncio.gather(
*[
self.download_and_resize_image(id_, url, semaphore=semaphore)
for id_, url in zip(dataframe.index, dataframe["images_url"])
for id_, url in zip(dataframe.index, dataframe["image_url"])
],
)
results.extend(images)

asyncio.run(download_dataframe())

columns = ["id", "data", "width", "height"]
columns = ["id", "image", "image_width", "image_height"]
if results:
results_df = pd.DataFrame(results, columns=columns)
else:
2 changes: 2 additions & 0 deletions components/download_images/tests/pytest.ini
@@ -0,0 +1,2 @@
[pytest]
pythonpath = ../src
2 changes: 2 additions & 0 deletions components/download_images/tests/requirements.txt
@@ -0,0 +1,2 @@
pytest==7.4.0
respx==0.20.2
8 changes: 4 additions & 4 deletions components/download_images/tests/test_component.py
@@ -45,7 +45,7 @@ def test_transform(respx_mock):

input_dataframe = pd.DataFrame(
{
"images_url": urls,
"image_url": urls,
},
index=pd.Index(ids, name="id"),
)
@@ -55,9 +55,9 @@ def test_transform(respx_mock):
resized_images = [component.resizer(io.BytesIO(image))[0] for image in images]
expected_dataframe = pd.DataFrame(
{
"images_data": resized_images,
"images_width": [image_size] * len(ids),
"images_height": [image_size] * len(ids),
"image": resized_images,
"image_width": [image_size] * len(ids),
"image_height": [image_size] * len(ids),
},
index=pd.Index(ids, name="id"),
)
4 changes: 2 additions & 2 deletions components/embed_images/README.md
@@ -7,11 +7,11 @@ Component that generates CLIP embeddings from images

**This component consumes:**

- images_data: binary
- image: binary

**This component produces:**

- embeddings_data: list<item: float>
- embedding: list<item: float>

### Arguments

4 changes: 2 additions & 2 deletions components/embed_images/fondant_component.yaml
@@ -5,11 +5,11 @@ tags:
- Image processing

consumes:
images_data:
image:
type: binary

produces:
embeddings_data:
embedding:
type: array
items:
type: float32
4 changes: 2 additions & 2 deletions components/embed_images/src/main.py
@@ -90,7 +90,7 @@ def __init__(
self.batch_size = batch_size

def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
images = dataframe["images_data"]
images = dataframe["image"]

results: t.List[pd.Series] = []
for batch in np.split(
@@ -110,4 +110,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
).T
results.append(embeddings)

return pd.concat(results).to_frame(name=("embeddings_data"))
return pd.concat(results).to_frame(name="embedding")
6 changes: 2 additions & 4 deletions components/embed_text/Dockerfile
@@ -17,14 +17,12 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team
# Set the working directory to the component folder
WORKDIR /component
COPY src/ src/
ENV PYTHONPATH "${PYTHONPATH}:./src"

FROM base as test
COPY test_requirements.txt .
RUN pip3 install --no-cache-dir -r test_requirements.txt
COPY tests/ tests/
RUN pip3 install --no-cache-dir -r tests/requirements.txt
RUN python -m pytest tests

FROM base
WORKDIR /component/src
ENTRYPOINT ["fondant", "execute", "main"]
ENTRYPOINT ["fondant", "execute", "main"]
5 changes: 2 additions & 3 deletions components/embed_text/README.md
@@ -7,12 +7,11 @@ Component that generates embeddings of text passages.

**This component consumes:**

- text_data: string
- text: string

**This component produces:**

- text_data: string
- text_embedding: list<item: float>
- embedding: list<item: float>

### Arguments
