-
Notifications
You must be signed in to change notification settings - Fork 42
/
translation.py
115 lines (88 loc) · 3.47 KB
/
translation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""Text translation using Argos Translate.
The node takes text input and translates it to English. The text may contain any
number of language directives in the form `lang:xx` where `xx` is a two-letter
language code. Text fragments after a language directive are translated.
If the language is `en`, the text is passed through unmodified.
"""
from __future__ import annotations
import re
from functools import cache
from typing import NamedTuple
@cache
def available_languages():
    """Return `(code, name)` pairs for languages that can be translated to English.

    The Argos Translate package index is refreshed, then every available
    package whose target language is `en` contributes its source language
    code and name. The result is memoized with `functools.cache`, so the
    index is only refreshed on the first call.

    Returns:
        A list of `(from_code, from_name)` tuples, or the single placeholder
        entry `("NOT INSTALLED", "NOT INSTALLED")` when an ImportError occurs
        (i.e. Argos Translate is not importable).
    """
    try:
        from argostranslate.package import update_package_index, get_available_packages

        update_package_index()
        # Named `packages` rather than `list` to avoid shadowing the builtin.
        packages = get_available_packages()
        return [(pkg.from_code, pkg.from_name) for pkg in packages if pkg.to_code == "en"]
    except ImportError:
        return [("NOT INSTALLED", "NOT INSTALLED")]
def translate_chunk(text: str, language: str):
    """Translate one text fragment from `language` to English.

    Whitespace-only text and text whose language is already `en` are returned
    unchanged. Otherwise, if no installed Argos Translate package covers the
    language pair, a matching package is looked up among the available
    packages and installed. The text is then passed through
    `_extract_embeddings`, translated, and the extracted embeddings string is
    prepended to the translation.

    Args:
        text: The fragment to translate.
        language: Source language code of the fragment.

    Returns:
        The translated text, prefixed by the extracted embeddings string.

    Raises:
        ImportError: If Argos Translate cannot be imported.
        AssertionError: If no package translating from `language` to English
            is available.
    """
    if text.strip() == "":
        return text

    target = "en"
    if language == target:
        return text

    # Guard only the imports: an ImportError raised later (e.g. while
    # installing a package) should not be reported as "not installed".
    try:
        from argostranslate.package import get_installed_packages, get_available_packages
        from argostranslate.translate import translate as argos_translate
    except ImportError as err:
        raise ImportError(
            "Argos Translate is not installed. Please install it with `pip install argostranslate`"
        ) from err

    def handles_pair(package) -> bool:
        return package.from_code == language and package.to_code == target

    if not any(handles_pair(p) for p in get_installed_packages()):
        pkg = next((p for p in get_available_packages() if handles_pair(p)), None)
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is kept for backward compatibility.
        if not pkg:
            raise AssertionError(f"Couldn't find package for translation from {language}")
        print("Downloading and installing translation package", pkg)
        pkg.install()

    text, embeddings = _extract_embeddings(text)
    translation = argos_translate(text, language, target)
    return embeddings + translation
def translate(text: str):
    """Translate `text`, honouring any `lang:xx` directives it contains.

    The text is split into language-tagged chunks by `Chunk.parse`; each chunk
    is run through `translate_chunk` and the results are joined with single
    spaces.
    """
    translated_parts = (
        translate_chunk(chunk.text, chunk.lang) for chunk in Chunk.parse(text)
    )
    return " ".join(translated_parts)
class Translate:
    """Node wrapper around the module-level `translate` function.

    NOTE(review): the `INPUT_TYPES` / `CATEGORY` / `RETURN_TYPES` / `FUNCTION`
    layout looks like a node-graph plugin interface - confirm against the host
    application.
    """

    # Name of the method that performs the node's work.
    FUNCTION = "translate"
    # A single string output.
    RETURN_TYPES = ("STRING",)
    CATEGORY = "external_tooling"

    @staticmethod
    def INPUT_TYPES():
        """Describe the node's inputs: one required multiline string `text`."""
        text_spec = ("STRING", {"multiline": True})
        return {"required": {"text": text_spec}}

    def translate(self, text: str):
        """Translate `text` and return the result as a one-element tuple."""
        # Resolves to the module-level `translate`, not this method.
        result = translate(text)
        return (result,)
# Matches a language directive: the literal `lang:` followed by two word characters.
_lang_regex = re.compile(r"(lang:\w\w)")


class Chunk(NamedTuple):
    """A fragment of input text together with the language code it is written in."""

    # The fragment's text, stripped of surrounding whitespace by `parse`.
    text: str
    # Language code that applies to the fragment.
    lang: str

    @staticmethod
    def parse(text: str):
        """Split `text` at `lang:xx` directives into language-tagged chunks.

        Text before the first directive is tagged `en`; text after a directive
        is tagged with that directive's code. Chunks whose text is empty after
        stripping are dropped.

        Raises:
            ValueError: If a directive names a code that is neither `en` nor
                one of the codes reported by `available_languages()`.
        """
        known_codes = [code for code, _name in available_languages()]
        known_codes.append("en")

        pieces: list[Chunk] = []
        current_lang = "en"
        cursor = 0
        for directive in _lang_regex.finditer(text):
            begin = directive.start()
            if begin > 0:
                pieces.append(Chunk(text[cursor:begin].strip(), current_lang))
            cursor = directive.end()
            # Drop the 5-character "lang:" prefix to get the code itself.
            current_lang = directive.group(0)[5:]
            if current_lang in known_codes:
                continue
            raise ValueError(
                f"Invalid language directive {directive.group(0)} - {current_lang} is not a known language code."
                f" Available languages: {', '.join(known_codes)}"
            )

        # Whatever follows the last directive (or the whole text if there was none).
        if cursor < len(text):
            pieces.append(Chunk(text[cursor:].strip(), current_lang))

        return [piece for piece in pieces if piece.text != ""]
_embedding_regex = re.compile(r"(embedding:[^\s,]+)")
def _extract_embeddings(text: str):
matches = _embedding_regex.findall(text)
embeddings = " ".join(matches)
if matches:
embeddings += " "
for m in matches:
text = text.replace(m, "")
return text, embeddings