-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LLM pipeline] Add filter out short texts component (#247)
This component filters out short text passages. The minimum text length can be passed as an argument. This component is needed for the LLM dataset creation pipeline. --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Robbe Sneyders <robbe.sneyders@gmail.com>
- Loading branch information
1 parent
879da0b
commit 7f6fd89
Showing
5 changed files
with
118 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim

# System dependencies
# git is needed because Fondant is pip-installed from its git repository below.
# The apt package lists are removed in the same layer so they are not baked
# into the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# Install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
# The requirement is quoted so the shell never treats the [extras] as a glob.
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
name: Filter text length | ||
description: A component that filters out text based on their length | ||
image: ghcr.io/ml6team/filter_text_length:latest | ||
|
||
consumes: | ||
text: | ||
fields: | ||
data: | ||
type: string | ||
|
||
args: | ||
min_characters_length: | ||
description: Minimum number of characters | ||
type: int | ||
min_words_length: | ||
description: Minimum number of words | ||
type: int |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pyarrow>=7.0 | ||
fasttext-wheel==0.9.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
"""A component that filters out text based on their length.""" | ||
import logging | ||
|
||
import fasttext | ||
import pandas as pd | ||
from fondant.component import PandasTransformComponent | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class TextLengthFilterComponent(PandasTransformComponent):
    """Component that drops rows whose text is shorter than given thresholds."""

    def setup(self, *, min_characters_length: int, min_words_length: int):
        """Store the length thresholds used by `transform`.

        Args:
            min_characters_length: minimum number of characters a text must have
            min_words_length: minimum number of words a text must have.
        """
        self.min_words_length = min_words_length
        self.min_characters_length = min_characters_length

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Keep only the rows whose text meets both length thresholds.

        A row is kept when the number of tokens returned by `fasttext.tokenize`
        is at least `min_words_length` and the number of characters is at
        least `min_characters_length`.

        Args:
            dataframe: Pandas dataframe with the text in column ("text", "data").

        Returns:
            Pandas dataframe containing only the rows that passed both checks.
        """
        texts = dataframe["text"]["data"]

        word_counts = texts.apply(lambda passage: len(fasttext.tokenize(passage)))
        char_counts = texts.apply(len)

        has_enough_words = word_counts >= self.min_words_length
        has_enough_chars = char_counts >= self.min_characters_length

        return dataframe[has_enough_words & has_enough_chars]
|
||
|
||
if __name__ == "__main__":
    # Instantiate the component via `from_args()` and run it right away.
    TextLengthFilterComponent.from_args().run()
31 changes: 31 additions & 0 deletions
31
components/text_length_filter/tests/text_length_filter_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
"""Unit test for text length filter component.""" | ||
import pandas as pd | ||
from fondant.component_spec import ComponentSpec | ||
|
||
from components.text_length_filter.src.main import TextLengthFilterComponent | ||
|
||
|
||
def test_run_component_test():
    """Test text length filter component.

    Feeds three texts through the component with a minimum of 20 characters
    and 4 words: one with too few words, one with too few characters and one
    that satisfies both limits. Only the last one must remain.
    """
    # Imported locally so this function carries its own dependency.
    from pathlib import Path

    # Given: Dataframe with text with different lengths
    data = [{"data": "To less words"},
            {"data": "Still to less chars"},
            {"data": "This a valid sentence which should be still there"}]

    dataframe = pd.concat({"text": pd.DataFrame(data)}, axis=1, names=["text", "data"])

    # When: The text filter component processes the dataframe
    # Resolve the spec relative to this test file rather than the current
    # working directory, so the test passes from wherever pytest is started.
    spec_path = Path(__file__).resolve().parent.parent / "fondant_component.yaml"
    spec = ComponentSpec.from_file(str(spec_path))

    component = TextLengthFilterComponent(spec, input_manifest_path="./dummy_input_manifest.json",
                                          output_manifest_path="./dummy_input_manifest.json",
                                          metadata={},
                                          user_arguments={"min_characters_length": 20,
                                                          "min_words_length": 4},
                                          )
    component.setup(min_characters_length=20, min_words_length=4)
    dataframe = component.transform(dataframe=dataframe)

    # Then: dataframe only contains one row
    assert len(dataframe) == 1
    # The surviving row keeps its original index label (2).
    assert dataframe.loc[2]["text"]["data"] == "This a valid sentence which should be still there"