Skip to content

Commit

Permalink
[LLM pipeline] Add normalize text component (#246)
Browse files Browse the repository at this point in the history
Component which applies different text normalizations (NFC normalization,
lowercasing, and removal of regex patterns).

This component is needed for the LLM dataset creation pipeline.
  • Loading branch information
mrchtr authored Jul 5, 2023
1 parent 2272f17 commit 7083693
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 0 deletions.
18 changes: 18 additions & 0 deletions components/text_normalization/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is needed because requirements.txt installs fondant from a git URL.
# The apt package index is removed in the same layer so it is not baked into
# the final image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
20 changes: 20 additions & 0 deletions components/text_normalization/fondant_component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Component specification for the text normalization component.
name: Normalize text.
description: A component that normalizes text.
image: ghcr.io/ml6team/text_normalization:latest

# Data read by the component: the string field `data` of the `text` subset.
consumes:
  text:
    fields:
      data:
        type: string

# Arguments of the component; the names match the keyword parameters of
# `TextNormalizationComponent.setup` in src/main.py.
args:
  apply_nfc:
    description: If true apply nfc normalization
    type: bool
  do_lowercase:
    description: If true apply lowercasing
    type: bool
  # NOTE(review): src/main.py passes every entry to `re.sub` as a regex
  # pattern, so regex metacharacters such as `?` or `.` must be escaped
  # (e.g. `\?`) to be removed literally — confirm the intended semantics.
  characters_to_remove:
    description: List of characters which will be removed, e.g. [?,.!,@#%]
    type: list
3 changes: 3 additions & 0 deletions components/text_normalization/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Fondant is installed straight from the `main` branch of the repository
# (unpinned, so the installed revision depends on build time).
git+https://github.com/ml6team/fondant.git@main
pyarrow>=7.0
gcsfs==2023.4.00
65 changes: 65 additions & 0 deletions components/text_normalization/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""A component that normalizes text."""
import logging
import re
import unicodedata
from typing import List

import pandas as pd

from fondant.component import PandasTransformComponent

logger = logging.getLogger(__name__)


class TextNormalizationComponent(PandasTransformComponent):
    """Component that normalizes the ``data`` field of the ``text`` subset.

    Depending on the arguments given to :meth:`setup`, the text is NFC
    normalized, lowercased, and stripped of every match of the configured
    regex patterns, in that order.
    """

    def setup(self, *, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]):
        """Store the normalization options.

        Args:
            apply_nfc: If true, apply Unicode NFC normalization.
            do_lowercase: If true, lowercase the text.
            characters_to_remove: Patterns removed from the text. Each entry
                is passed to ``re.sub`` as a regular expression, so regex
                metacharacters must be escaped to be matched literally.
        """
        self.apply_nfc = apply_nfc
        self.do_lowercase = do_lowercase
        self.characters_to_remove = characters_to_remove

    @staticmethod
    def _do_nfc_normalization(text: str) -> str:
        """Return ``text`` in Unicode NFC normal form."""
        return unicodedata.normalize("NFC", text)

    @staticmethod
    def _remove_patterns(regex_patterns: List[str], text: str) -> str:
        """Return ``text`` with every match of each regex pattern removed.

        Patterns are applied one after another, in list order.
        """
        for pattern in regex_patterns:
            text = re.sub(pattern, "", text)
        return text

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Apply the configured normalization steps to ``text.data``.

        The component is capable of:
        - NFC normalization
        - Lowercasing
        - Removal of regex patterns.

        Args:
            dataframe: Pandas dataframe with a ``("text", "data")`` column.

        Returns:
            The same dataframe with the normalized text written back to the
            ``("text", "data")`` column.
        """
        text = dataframe["text"]["data"]

        # ``Series.apply`` returns a new Series and never modifies in place,
        # so each result must be kept and finally written back.
        if self.apply_nfc:
            text = text.apply(self._do_nfc_normalization)

        if self.do_lowercase:
            text = text.apply(lambda x: x.lower())

        # Truthiness check also skips the step when no patterns are given.
        if self.characters_to_remove:
            patterns = self.characters_to_remove
            text = text.apply(lambda x: self._remove_patterns(patterns, x))

        # Assign through the full column key; chained indexing
        # (``dataframe["text"]["data"] = ...``) would write to a copy.
        dataframe[("text", "data")] = text

        return dataframe


# Script entry point: build the component via `from_args` and run it.
# NOTE(review): `from_args` and `run` are not defined in this file —
# presumably inherited from the fondant base component; confirm there.
if __name__ == "__main__":
    component = TextNormalizationComponent.from_args()
    component.run()

0 comments on commit 7083693

Please sign in to comment.