From 5b27c8afcb5a3d59a542c0f0ca1a48cec0694999 Mon Sep 17 00:00:00 2001
From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com>
Date: Fri, 25 Oct 2024 06:00:03 -0400
Subject: [PATCH] Apply #803

---
 .github/workflows/build.yml                   |  2 +-
 setup.py                                      |  2 +-
 src/python/txtai/pipeline/text/translation.py | 28 +++++++++++++++++--
 src/python/txtai/version.py                   |  2 +-
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 24e61b887..7dc9b5b70 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
       - name: Build
         run: |
           pip install -U wheel
-          pip install .[all,dev] fasttext==0.9.2
+          pip install .[all,dev] fasttext==0.9.2 pillow==10.4.0
           python -c "import nltk; nltk.download(['punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'])"
           python --version
           make data coverage
diff --git a/setup.py b/setup.py
index d2d1839fd..c8a159b89 100644
--- a/setup.py
+++ b/setup.py
@@ -130,7 +130,7 @@
 
 setup(
     name="txtai",
-    version="7.5.0",
+    version="7.5.1",
     author="NeuML",
     description="All-in-one open-source embeddings database for semantic search, LLM orchestration and language model workflows",
     long_description=DESCRIPTION,
diff --git a/src/python/txtai/pipeline/text/translation.py b/src/python/txtai/pipeline/text/translation.py
index 1b68b4849..d15e0a582 100644
--- a/src/python/txtai/pipeline/text/translation.py
+++ b/src/python/txtai/pipeline/text/translation.py
@@ -2,6 +2,8 @@
 Translation module
 """
 
+import os
+
 # Conditional import
 try:
     import fasttext
@@ -10,7 +12,7 @@
 except ImportError:
     FASTTEXT = False
 
-from huggingface_hub import cached_download
+from huggingface_hub import hf_hub_download
 from huggingface_hub.hf_api import HfApi
 
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
@@ -24,7 +26,7 @@ class Translation(HFModel):
     """
 
     # Default language detection model
-    DEFAULT_LANG_DETECT = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
+    DEFAULT_LANG_DETECT = "julien-c/fasttext-language-id/lid.176.ftz"
 
     def __init__(self, path=None, quantize=False, gpu=True, batch=64, langdetect=None, findmodels=True):
         """
@@ -155,7 +157,7 @@ def defaultdetect(self, texts):
             path = self.langdetect if self.langdetect else Translation.DEFAULT_LANG_DETECT
 
             # Load language detection model
-            path = cached_download(path, legacy_cache_layout=True)
+            path = path if os.path.exists(path) else self.download(path)
             self.detector = fasttext.load_model(path)
 
         # Transform texts to format expected by language detection model
@@ -163,6 +165,26 @@ def defaultdetect(self, texts):
 
         return [x[0].split("__")[-1] for x in self.detector.predict(texts)[0]]
 
+    def download(self, path):
+        """
+        Downloads path from the Hugging Face Hub.
+
+        Args:
+            path: full model path
+
+        Returns:
+            local cached model path
+        """
+
+        # Split into parts
+        parts = path.split("/")
+
+        # Calculate repo id split
+        repo = 2 if len(parts) > 2 else 1
+
+        # Download and cache file
+        return hf_hub_download(repo_id="/".join(parts[:repo]), filename="/".join(parts[repo:]))
+
     def translate(self, texts, source, target, showmodels=False):
         """
         Translates text from source to target language.
diff --git a/src/python/txtai/version.py b/src/python/txtai/version.py
index ebd1beebf..048ae24cc 100644
--- a/src/python/txtai/version.py
+++ b/src/python/txtai/version.py
@@ -3,4 +3,4 @@
 """
 
 # Current version tag
-__version__ = "7.5.0"
+__version__ = "7.5.1"
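
For context on the translation.py change above: DEFAULT_LANG_DETECT now names a Hugging Face Hub file ("julien-c/fasttext-language-id/lid.176.ftz") instead of a direct download URL, and the new download() helper splits that path into a repo_id and filename for hf_hub_download, while paths that already exist on disk are used as-is. Below is a minimal standalone sketch of that resolution logic; the resolve() function name is illustrative and not part of the patch.

import os

from huggingface_hub import hf_hub_download


def resolve(path):
    """
    Resolves a language detection model path to a local file.

    Mirrors the patch logic: existing local paths are returned directly,
    otherwise the path is treated as "<user>/<repo>/<filename>" (or
    "<repo>/<filename>") on the Hugging Face Hub.
    """

    # Local file path, use directly
    if os.path.exists(path):
        return path

    # Split into repo id and filename components
    parts = path.split("/")
    repo = 2 if len(parts) > 2 else 1

    # Download and cache file, return local cached path
    return hf_hub_download(repo_id="/".join(parts[:repo]), filename="/".join(parts[repo:]))


# Example: resolves the default model to a cached copy of lid.176.ftz
# print(resolve("julien-c/fasttext-language-id/lid.176.ftz"))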