[LLM pipeline] Add normalize text component (#246)

Component which applies different text normalizations (NFC, lowercasing and regex pattern replacements). This component is needed for the LLM dataset creation pipeline.
ml6team · Jul 5, 2023 · 7083693 · 7083693
1 parent 2272f17
commit 7083693
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 0 deletions.
diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile
@@ -0,0 +1,18 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+# git is needed because requirements.txt installs a package from a git URL.
+# The apt package lists are removed in the same layer so they are not stored
+# in the final image.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+ENTRYPOINT ["python", "main.py"]
diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml
@@ -0,0 +1,20 @@
+name: Normalize text.
+description: A component that normalizes text.
+image: ghcr.io/ml6team/text_normalization:latest
+
+consumes:
+  text:
+    fields:
+      data:
+        type: string
+
+args:
+  apply_nfc:
+    description: If true apply nfc normalization
+    type: bool
+  do_lowercase:
+    description: If true apply lowercasing
+    type: bool
+  # Each entry is handed to `re.sub` as a regular expression, so regex
+  # metacharacters (e.g. `?` or `.`) must be escaped or placed in a character class.
+  characters_to_remove:
+    description: 'List of regex patterns whose matches will be removed, e.g. ["[?.!,@#%]"]'
+    type: list
diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt
@@ -0,0 +1,3 @@
+git+https://github.com/ml6team/fondant.git@main
+pyarrow>=7.0
+gcsfs==2023.4.00
diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py
@@ -0,0 +1,65 @@
+"""A component that normalizes text."""
+import logging
+import re
+import unicodedata
+from typing import List
+
+import pandas as pd
+
+from fondant.component import PandasTransformComponent
+
+logger = logging.getLogger(__name__)
+
+
+class TextNormalizationComponent(PandasTransformComponent):
+    """Component that normalizes text."""
+
+    def setup(self, *, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]):
+        self.apply_nfc = apply_nfc
+        self.do_lowercase = do_lowercase
+        self.characters_to_remove = characters_to_remove
+
+    @staticmethod
+    def _do_nfc_normalization(text: str):
+        """Apply nfc normalization to the text of the dataframe."""
+        return unicodedata.normalize("NFC", text)
+
+    @staticmethod
+    def _remove_patterns(regex_patterns: List[str], text: str):
+        """Remove each regex pattern in the provided string."""
+        for pattern in regex_patterns:
+            text = re.sub(pattern, "", text)
+        return text
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        """
+        Apply normalization transformations. The component is capable of:
+        - NFC normalization
+        - Lowercasing
+        - Removing of regex patterns.
+
+        Args:
+            dataframe: Pandas dataframe.
+
+        Returns:
+            Pandas dataframe
+        """
+        if self.apply_nfc:
+            dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x))
+
+        if self.do_lowercase:
+            dataframe["text"]["data"].apply(lambda x: x.lower())
+
+        if len(self.characters_to_remove) > 0:
+            dataframe["text"]["data"].apply(
+                lambda x: self._remove_patterns(
+                    self.characters_to_remove, x,
+                ),
+            )
+
+        return dataframe
+
+
+if __name__ == "__main__":
+    component = TextNormalizationComponent.from_args()
+    component.run()