Skip to content

Commit

Permalink
Merge pull request #14 from projecte-aina/lingua_franca
Browse files Browse the repository at this point in the history
Lingua_franca addition for parsing written hours
  • Loading branch information
gullabi authored Feb 23, 2024
2 parents ca65154 + ae2bded commit f47520d
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 0 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ RUN pip install --upgrade pip && \
make && \
make install

RUN pip install git+https://github.com/MycroftAI/lingua-franca.git@5bfd75fe5996fd364102a0eec3f714c9ddc9275c

WORKDIR /app
COPY ./requirements.txt /app
Expand Down
33 changes: 33 additions & 0 deletions server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
import sys
import traceback
import tempfile
import datetime
import re

# Lingua franca
from lingua_franca import load_language
from lingua_franca.format import nice_time
from lingua_franca.time import default_timezone

# Libraries for multiprocessing
import multiprocessing as mp
Expand Down Expand Up @@ -59,6 +66,9 @@
# Initialize sentence segmenter
# NOTE(review): the segmenter is configured for "en" while lingua_franca
# below is loaded for 'ca-es' -- confirm the mismatch is intentional.
segmenter = Segmenter(language="en")

# Load the 'ca-es' language data into lingua_franca so that its formatting
# helpers (e.g. nice_time) can later be called for Catalan.
load_language('ca-es')

def create_argparser():
def convert_boolean(x):
return x.lower() in ["true", "1", "yes"]
Expand Down Expand Up @@ -318,6 +328,29 @@ async def details(request: Request):


def worker(sentence, speaker_id, model, use_aliases, new_speaker_ids):
def substitute_time(sentence):
    """Replace written clock times (``H:MM`` / ``HH:MM``) with spoken text.

    A time is recognised only when it has whitespace on both sides. Each
    recognised, valid time is rendered with lingua_franca's ``nice_time``
    (``lang="ca"``, 24-hour mode) and substituted in place.

    A lowercase article ``les`` that directly precedes a converted time is
    dropped together with its trailing whitespace.
    NOTE(review): presumably this is because the Catalan ``nice_time`` output
    already starts with its own article -- confirm against lingua_franca.

    Args:
        sentence: Input text to scan for times.

    Returns:
        The sentence with every valid time spelled out. Text without a
        recognisable time, and times whose hour/minute are out of range
        (hour > 23 or minute > 59), are returned untouched.
    """
    # Optional article, then the time itself. The look-arounds keep the
    # original rule that a time must be delimited by whitespace; the word
    # boundary stops "les" from matching the tail of a longer word.
    time_pattern = re.compile(r'(?:\bles\s+)?(?<=\s)(\d{1,2}):(\d{2})(?=\s)')

    def spoken_time(match):
        hour = int(match.group(1))
        minute = int(match.group(2))
        # "25:99" looks like a time but is not one; leave it as written
        # instead of letting datetime raise ValueError.
        if hour > 23 or minute > 59:
            return match.group(0)
        # Only hour and minute matter here; the date is an arbitrary
        # placeholder and the timezone is lingua_franca's default one.
        dt = datetime.datetime(2017, 1, 31, hour, minute, 0,
                               tzinfo=default_timezone())
        return nice_time(dt, lang="ca", use_24hour=True)

    # A single regex pass replaces exactly the matched spans, so a short time
    # such as "1:30" can never rewrite part of a longer one such as "11:30".
    return time_pattern.sub(spoken_time, sentence)

sentence = substitute_time(sentence)

print(" > Model input: {}".format(sentence))
print(" > Speaker Idx: {}".format(speaker_id))
Expand Down

0 comments on commit f47520d

Please sign in to comment.