[LLM pipeline] Add filter out short texts component (#247)

This component filters out short text passages. The minimum text length can be passed as an argument. This component is needed for the LLM dataset creation pipeline. --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Robbe Sneyders <robbe.sneyders@gmail.com>
ml6team · Jul 11, 2023 · 7f6fd89 · 7f6fd89
1 parent 879da0b
commit 7f6fd89
Show file tree

Hide file tree

Showing 5 changed files with 118 additions and 0 deletions.
diff --git a/components/text_length_filter/Dockerfile b/components/text_length_filter/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# System dependencies (git is required to pip-install Fondant from GitHub below)
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+ENTRYPOINT ["python", "main.py"]
diff --git a/components/text_length_filter/fondant_component.yaml b/components/text_length_filter/fondant_component.yaml
@@ -0,0 +1,17 @@
+name: Filter text length
+description: A component that filters out text based on their length
+image: ghcr.io/ml6team/filter_text_length:latest
+
+consumes:
+  text:
+    fields:
+      data:
+        type: string
+
+args:
+  min_characters_length:
+    description: Minimum number of characters
+    type: int
+  min_words_length:
+    description: Minimum number of words
+    type: int
diff --git a/components/text_length_filter/requirements.txt b/components/text_length_filter/requirements.txt
@@ -0,0 +1,2 @@
+pyarrow>=7.0
+fasttext-wheel==0.9.2
diff --git a/components/text_length_filter/src/main.py b/components/text_length_filter/src/main.py
@@ -0,0 +1,45 @@
+"""A component that filters out text based on their length."""
+import logging
+
+import fasttext
+import pandas as pd
+from fondant.component import PandasTransformComponent
+
+logger = logging.getLogger(__name__)
+
+
+class TextLengthFilterComponent(PandasTransformComponent):
+    """Keep only the rows whose text meets the minimum word and character counts."""
+
+    def setup(self, *, min_characters_length: int, min_words_length: int):
+        """Store the length thresholds that `transform` filters on.
+
+        Args:
+            min_characters_length: minimum number of characters a text needs to be kept.
+            min_words_length: minimum number of words a text needs to be kept.
+        """
+        self.min_characters_length = min_characters_length
+        self.min_words_length = min_words_length
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        """
+        Drop the rows whose text is shorter than either configured minimum.
+
+        Args:
+            dataframe: Pandas dataframe; the text is read from dataframe["text"]["data"].
+
+        Returns:
+            The rows with >= min_words_length fasttext tokens and >= min_characters_length chars.
+        """
+        caption_num_words = dataframe["text"]["data"].apply(lambda x: len(fasttext.tokenize(x)))
+        caption_num_chars = dataframe["text"]["data"].apply(len)
+
+        mask = (caption_num_words >= self.min_words_length) & \
+               (caption_num_chars >= self.min_characters_length)
+        dataframe = dataframe[mask]
+        return dataframe
+
+
+if __name__ == "__main__":  # script entry point; the Dockerfile ENTRYPOINT runs this file
+    component = TextLengthFilterComponent.from_args()
+    component.run()
diff --git a/components/text_length_filter/tests/text_length_filter_test.py b/components/text_length_filter/tests/text_length_filter_test.py
@@ -0,0 +1,31 @@
+"""Unit test for text length filter component."""
+import pandas as pd
+from fondant.component_spec import ComponentSpec
+
+from components.text_length_filter.src.main import TextLengthFilterComponent
+
+
+def test_run_component_test():
+    """Test that only the text meeting both length thresholds is kept."""
+    # Given: a dataframe with texts of different lengths
+    data = [{"data": "To less words"},
+            {"data": "Still to less chars"},
+            {"data": "This a valid sentence which should be still there"}]
+
+    dataframe = pd.concat({"text": pd.DataFrame(data)}, axis=1, names=["text", "data"])
+
+    # When: the text length filter component processes the dataframe
+    spec = ComponentSpec.from_file("../fondant_component.yaml")
+
+    component = TextLengthFilterComponent(spec, input_manifest_path="./dummy_input_manifest.json",
+                                        output_manifest_path="./dummy_input_manifest.json",
+                                        metadata={},
+                                        user_arguments={"min_characters_length": 20,
+                                                        "min_words_length": 4},
+                                        )
+    component.setup(min_characters_length=20, min_words_length=4)
+    dataframe = component.transform(dataframe=dataframe)
+
+    # Then: only the row that meets both thresholds (index 2) remains
+    assert len(dataframe) == 1
+    assert dataframe.loc[2]["text"]["data"] == "This a valid sentence which should be still there"