Merge pull request #14 from projecte-aina/lingua_franca

Lingua_franca addition for parsing written hours
projecte-aina · Feb 23, 2024 · f47520d · f47520d
2 parents ca65154 + ae2bded
commit f47520d
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -22,6 +22,7 @@ RUN pip install --upgrade pip && \
  make && \
  make install
 
+RUN pip install git+https://github.com/MycroftAI/lingua-franca.git@5bfd75fe5996fd364102a0eec3f714c9ddc9275c
 
 WORKDIR /app
 COPY ./requirements.txt /app

diff --git a/server/server.py b/server/server.py
@@ -7,6 +7,13 @@
 import sys
 import traceback
 import tempfile
+import datetime
+import re
+
+# Lingua franca
+from lingua_franca import load_language
+from lingua_franca.format import nice_time
+from lingua_franca.time import default_timezone
 
 # Libraries for multiprocessing
 import multiprocessing as mp
@@ -59,6 +66,9 @@
 # Initialize sentence segmenter
 segmenter = Segmenter(language="en")
 
+# Load lingua_franca's Catalan ('ca-es') language data once at module import; the time formatting done in worker() asks nice_time() for lang="ca".
+load_language('ca-es')
+
 def create_argparser():
     def convert_boolean(x):
         return x.lower() in ["true", "1", "yes"]
@@ -318,6 +328,29 @@ async def details(request: Request):
 
 
 def worker(sentence, speaker_id, model, use_aliases, new_speaker_ids):
+    def substitute_time(sentence):
+        # Regular expression to find time pattern (HH:MM)
+        time_pattern = re.compile(r'((?<=\s)\d{1,2}):(\d{2}(?=\s))')
+
+        # Find all matches of time pattern in the sentence
+        matches = re.findall(time_pattern, sentence)
+
+        if not matches:
+            return sentence
+
+        sentence = re.sub(r'les\s+', '', sentence, count=1)
+
+        # Iterate through matches and substitute with formatted time string
+        for match in matches:
+            H = int(match[0])
+            M = int(match[1])
+            dt = datetime.datetime(2017, 1, 31, H, M, 0, tzinfo=default_timezone())  # Using UTC timezone for simplicity
+            formatted_time = nice_time(dt, lang="ca", use_24hour=True)  # Assuming you have a function to format time in Catalan
+            sentence = sentence.replace(f'{match[0]}:{match[1]}', formatted_time)
+
+        return sentence
+
+    sentence = substitute_time(sentence)
 
     print(" > Model input: {}".format(sentence))
     print(" > Speaker Idx: {}".format(speaker_id))