Skip to content

Commit

Permalink
[LLM pipeline] Add filter out short texts component (#247)
Browse files Browse the repository at this point in the history
This component filters out short text passages. The minimum text length can be
passed as an argument.

This component is needed for the LLM dataset creation pipeline.

---------

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Robbe Sneyders <robbe.sneyders@gmail.com>
  • Loading branch information
3 people authored Jul 11, 2023
1 parent 879da0b commit 7f6fd89
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 0 deletions.
23 changes: 23 additions & 0 deletions components/text_length_filter/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image for the text length filter component.
# Base: slim Python 3.8 image, pinned to the linux/amd64 platform.
FROM --platform=linux/amd64 python:3.8-slim

# System dependencies
# git is required because Fondant is pip-installed from a git URL further down.
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# Install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
# FONDANT_VERSION is the git ref appended to the repository URL; it defaults to
# `main` and can be overridden at build time with --build-arg.
ARG FONDANT_VERSION=main
RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Start the component; main.py is resolved relative to WORKDIR.
ENTRYPOINT ["python", "main.py"]
17 changes: 17 additions & 0 deletions components/text_length_filter/fondant_component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fondant component specification for the text length filter.
name: Filter text length
description: A component that filters out text based on their length
# NOTE(review): the image is named `filter_text_length` while the component
# folder is `text_length_filter` - confirm this matches the published image.
image: ghcr.io/ml6team/filter_text_length:latest

# Input columns read by the component: a single string field `text.data`.
consumes:
  text:
    fields:
      data:
        type: string

# Arguments forwarded to the component's `setup` method.
args:
  min_characters_length:
    description: Minimum number of characters
    type: int
  min_words_length:
    description: Minimum number of words
    type: int
2 changes: 2 additions & 0 deletions components/text_length_filter/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pyarrow>=7.0
fasttext-wheel==0.9.2
45 changes: 45 additions & 0 deletions components/text_length_filter/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""A component that filters out text based on their length."""
import logging

import fasttext
import pandas as pd
from fondant.component import PandasTransformComponent

logger = logging.getLogger(__name__)


class TextLengthFilterComponent(PandasTransformComponent):
    """A component that filters out text based on their length.

    A row is kept only when its `text.data` value reaches both the minimum
    number of characters and the minimum number of words given to `setup`.
    """

    def setup(self, *, min_characters_length: int, min_words_length: int) -> None:
        """Setup component.

        Args:
            min_characters_length: minimum number of characters
            min_words_length: minimum number of words.
        """
        self.min_characters_length = min_characters_length
        self.min_words_length = min_words_length

    def _is_long_enough(self, text) -> bool:
        """Return True when `text` satisfies both configured minimum lengths.

        Missing or non-string values (e.g. None / NaN) have no measurable
        length, so they are treated as too short instead of raising a
        TypeError from `len()` or `fasttext.tokenize()`.
        """
        if not isinstance(text, str):
            return False
        # Cheap character check first; tokenization only runs when needed.
        if len(text) < self.min_characters_length:
            return False
        # Words are counted as the tokens produced by fasttext.tokenize.
        return len(fasttext.tokenize(text)) >= self.min_words_length

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Filter out text based on their length.

        Args:
            dataframe: Pandas dataframe with a `text` -> `data` string column.

        Returns:
            Pandas dataframe containing only the rows whose text meets both
            the minimum character count and the minimum word count.
        """
        texts = dataframe["text"]["data"]
        # Cast explicitly so the mask is a boolean indexer even when the
        # partition is empty and apply() has no results to infer a dtype from.
        keep_mask = texts.apply(self._is_long_enough).astype(bool)
        return dataframe[keep_mask]


if __name__ == "__main__":
    # Script entry point: create the component through `from_args` and run it.
    TextLengthFilterComponent.from_args().run()
31 changes: 31 additions & 0 deletions components/text_length_filter/tests/text_length_filter_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Unit test for text length filter component."""
import pandas as pd
from fondant.component_spec import ComponentSpec

from components.text_length_filter.src.main import TextLengthFilterComponent


def test_run_component_test():
    """Test text length filter component."""
    from pathlib import Path

    # Given: Dataframe with texts of different lengths.
    # The exact wording matters: with min_words_length=4 and
    # min_characters_length=20 the first text has too few words (3) and the
    # second has too few characters (19), so do not "correct" these strings.
    data = [{"data": "To less words"},
            {"data": "Still to less chars"},
            {"data": "This a valid sentence which should be still there"}]

    dataframe = pd.concat({"text": pd.DataFrame(data)}, axis=1, names=["text", "data"])

    # When: The text filter component processes the dataframe.
    # Resolve the spec relative to this test file so the test does not depend
    # on the working directory pytest is started from.
    spec_path = Path(__file__).resolve().parent.parent / "fondant_component.yaml"
    spec = ComponentSpec.from_file(str(spec_path))

    component = TextLengthFilterComponent(spec, input_manifest_path="./dummy_input_manifest.json",
                                          output_manifest_path="./dummy_input_manifest.json",
                                          metadata={},
                                          user_arguments={"min_characters_length": 20,
                                                          "min_words_length": 4},
                                          )
    component.setup(min_characters_length=20, min_words_length=4)
    dataframe = component.transform(dataframe=dataframe)

    # Then: dataframe only contains one row (the original index 2 is preserved)
    assert len(dataframe) == 1
    assert dataframe.loc[2]["text"]["data"] == "This a valid sentence which should be still there"

0 comments on commit 7f6fd89

Please sign in to comment.