Skip to content

Commit

Permalink
Merge pull request #19 from projecte-aina/matxa_migration
Browse files Browse the repository at this point in the history
Matxa migration
  • Loading branch information
PaulNdrei authored Sep 30, 2024
2 parents d0d6df9 + e4b51b8 commit 1412f52
Show file tree
Hide file tree
Showing 42 changed files with 1,365 additions and 955 deletions.
50 changes: 36 additions & 14 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,15 +1,37 @@
**/.git
**/.gitignore
**/.vscode
**/coverage
**/.env
**/.ssh
Dockerfile
# Ignore the virtual environment directory
venv

# Ignore Python cache files
__pycache__

# Ignore environment and dependency files
*.env
*.log
*.tmp

# Ignore configuration and metadata files
CODE_OF_CONDUCT.md
CONTRIBUTING.md
docker-compose*.yml
LICENSE.txt
Makefile
README.md
docker-compose.yml
**/.DS_Store
**/venv
**/env
**/__pycache__
charts
Makefile
SECURITY.md

# Ignore specific scripts and files
infer_wavenext_onnx.py
_main.py

# Ignore any other .md files
*.md

# Ignore any other temporary or unnecessary files
*.swp
*.bak
*.tmp
*.orig

# Exclude specific files or directories if needed for the Docker build
# !important_file.py
# !important_directory/
models/matxa_onnx/best_model.onnx
2 changes: 2 additions & 0 deletions .github/workflows/docker-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,7 @@ jobs:
uses: docker/build-push-action@v5
with:
context: .
secrets: |
HF_TOKEN=${{ secrets.HF_TOKEN }}
push: true
tags: projecteaina/tts-api:latest, projecteaina/tts-api:${{ github.sha }}
11 changes: 3 additions & 8 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,13 @@ jobs:
- name: install espeak-ng
run: |
git clone -b dev-ca https://github.com/projecte-aina/espeak-ng
git clone https://github.com/espeak-ng/espeak-ng
pip install --upgrade pip
cd espeak-ng && sudo ./autogen.sh && sudo ./configure --prefix=/usr && sudo make && sudo make install
- name: install lingua-franca
run: pip install git+https://github.com/MycroftAI/lingua-franca.git@5bfd75fe5996fd364102a0eec3f714c9ddc9275c

- name: install model
- name: download model
run: |
wget -q http://share.laklak.eu/model_vits_ca/best_model_8khz.pth -P ./models/vits_ca/
mv ./models/vits_ca/best_model_8khz.pth ./models/vits_ca/best_model.pth
wget --header="Authorization: Bearer ${{ secrets.HF_TOKEN }}" https://huggingface.co/projecte-aina/matxa-tts-cat-multiaccent/resolve/main/matxa_multiaccent_wavenext_e2e.onnx -O ./models/matxa_onnx/best_model.onnx
- name: Run tests
run: pytest

Expand Down
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
*.pth
*.json
models
venv*
__pycache__
**/.env
!models/matxa_onnx/spk_ids.json
venv/
**/__pycache__/
.env
21 changes: 12 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM python:3.10.12-slim
# RUN apt-get update && apt-get install -y --no-install-recommends wget gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*

# Install required packages for building eSpeak and general utilities

RUN apt-get update && apt-get install -y \
build-essential \
autoconf \
Expand All @@ -13,24 +13,27 @@ RUN apt-get update && apt-get install -y \
cmake \
&& rm -rf /var/lib/apt/lists/*

RUN git clone -b dev-ca https://github.com/projecte-aina/espeak-ng
# download huggingface gated model
RUN mkdir -p /app/models/matxa_onnx

RUN --mount=type=secret,id=HF_TOKEN \
wget --header="Authorization: Bearer $(cat /run/secrets/HF_TOKEN)" https://huggingface.co/projecte-aina/matxa-tts-cat-multiaccent/resolve/main/matxa_multiaccent_wavenext_e2e.onnx -O /app/models/matxa_onnx/best_model.onnx

# install espeak-ng

RUN git clone https://github.com/espeak-ng/espeak-ng
RUN pip install --upgrade pip && \
cd espeak-ng && \
./autogen.sh && \
./configure --prefix=/usr && \
make && \
make install

RUN pip install git+https://github.com/MycroftAI/lingua-franca.git@5bfd75fe5996fd364102a0eec3f714c9ddc9275c

WORKDIR /app
COPY ./requirements.txt /app
RUN python -m pip install --upgrade pip
RUN python -m pip install --no-cache-dir -r requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

RUN wget -q http://share.laklak.eu/model_vits_ca/best_model_8khz.pth -P /app/models/vits_ca/
RUN mv /app/models/vits_ca/best_model_8khz.pth /app/models/vits_ca/best_model.pth
COPY . .

ENTRYPOINT python main.py --speech_speed ${SPEECH_SPEED} --mp_workers ${MP_WORKERS} --use_cuda ${USE_CUDA} --use_mp ${USE_MP}
ENTRYPOINT python main.py --speech_speed ${SPEECH_SPEED} --use_cuda ${USE_CUDA}
15 changes: 9 additions & 6 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,14 @@ RUN apt-get update && apt-get install -y \
cmake \
&& rm -rf /var/lib/apt/lists/*

RUN git clone -b dev-ca https://github.com/projecte-aina/espeak-ng
# download huggingface gated model
RUN mkdir -p /app/models/matxa_onnx

RUN --mount=type=secret,id=HF_TOKEN \
wget --header="Authorization: Bearer $(cat /run/secrets/HF_TOKEN)" https://huggingface.co/projecte-aina/matxa-tts-cat-multiaccent/resolve/main/matxa_multiaccent_wavenext_e2e.onnx -O /app/models/matxa_onnx/best_model.onnx

RUN git clone https://github.com/espeak-ng/espeak-ng


RUN pip install --upgrade pip && \
cd espeak-ng && \
Expand All @@ -22,14 +29,10 @@ RUN pip install --upgrade pip && \
make && \
make install

RUN pip install git+https://github.com/MycroftAI/lingua-franca.git@5bfd75fe5996fd364102a0eec3f714c9ddc9275c

WORKDIR /app
# RUN wget -q http://share.laklak.eu/model_vits_ca/best_model_8khz.pth -P /app/models/vits_ca/
# RUN mv /app/models/vits_ca/best_model_8khz.pth /app/models/vits_ca/best_model.pth

COPY ./requirements.txt /app
RUN python -m pip install --upgrade pip
RUN python -m pip install --no-cache-dir -r requirements.txt

ENTRYPOINT python main.py --speech_speed ${SPEECH_SPEED} --mp_workers ${MP_WORKERS} --use_cuda ${USE_CUDA} --use_mp ${USE_MP} --show_details True --reload
ENTRYPOINT python main.py --speech_speed ${SPEECH_SPEED} --use_cuda ${USE_CUDA} --show_details True --reload
30 changes: 3 additions & 27 deletions Dockerfile.test
Original file line number Diff line number Diff line change
@@ -1,34 +1,10 @@
FROM python:3.10.12-slim
# RUN apt-get update && apt-get install -y --no-install-recommends wget gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*

# Install required packages for building eSpeak and general utilities
RUN apt-get update && apt-get install -y \
build-essential \
autoconf \
automake \
libtool \
pkg-config \
git \
wget \
cmake \
&& rm -rf /var/lib/apt/lists/*

RUN git clone -b dev-ca https://github.com/projecte-aina/espeak-ng

RUN pip install --upgrade pip && \
cd espeak-ng && \
./autogen.sh && \
./configure --prefix=/usr && \
make && \
make install

RUN pip install git+https://github.com/MycroftAI/lingua-franca.git@5bfd75fe5996fd364102a0eec3f714c9ddc9275c

WORKDIR /app

COPY ./requirements.txt /app
RUN python -m pip install --upgrade pip
RUN python -m pip install --no-cache-dir -r requirements.txt
RUN pip install pytest httpx pydub pytest-repeat
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install pytest httpx pydub pytest-repeat --no-cache-dir

ENTRYPOINT pytest
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ stop:


act-run-tests:
gh act -j test -W '.github/workflows/tests.yml'
act --secret-file .env -j test -W '.github/workflows/tests.yml'
67 changes: 34 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# TTS API

RestFUL api and web interface to serve coqui TTS models
RESTful API and web interface to serve Matcha TTS models

## Installation

The requirements are tested for python 3.10. In order for coqui TTS to work, some dependencies should be installed.
The requirements are tested for python 3.10. In order for matcha TTS to work, some dependencies should be installed.

1. Update your system's package list and install the required packages for building eSpeak and general utilities:
```bash
Expand All @@ -18,9 +18,10 @@ sudo apt update && sudo apt install -y \
wget \
cmake
```

2. Clone the eSpeak-ng repository and build it:
```bash
git clone -b dev-ca https://github.com/projecte-aina/espeak-ng
git clone https://github.com/espeak-ng/espeak-ng
cd espeak-ng && \
sudo ./autogen.sh && \
sudo ./configure --prefix=/usr && \
Expand All @@ -34,14 +35,29 @@ Later simply:
python -m pip install --upgrade pip
```

In order to synthesize, the actual model needs to be downloaded and the paths in the config file need to be changed (replacing `/opt` with the top directory of the repository). The model can be downloaded from [http://share.laklak.eu/model_vits_ca/best_model.pth](http://share.laklak.eu/model_vits_ca/best_model.pth) to the models directory.

> [!NOTE]
> The model **best_model.onnx** is required; you have to download it yourself.
Download the model from HuggingFace
https://huggingface.co/projecte-aina/matxa-tts-cat-multiaccent/resolve/main/matxa_multiaccent_wavenext_e2e.onnx

Note: You will need a Hugging Face account because access to the model is gated.

Rename the ONNX model to `best_model.onnx` and move it to the `./models/matxa_onnx` folder,

or download using wget

```bash
wget --header="Authorization: Bearer REPLACE_WITH_YOUR_HF_TOKEN" https://huggingface.co/projecte-aina/matxa-tts-cat-multiaccent/resolve/main/matxa_multiaccent_wavenext_e2e.onnx -O ./models/matxa_onnx/best_model.onnx
```

## Launch

tts-api uses `FastAPI` and `uvicorn` under the hood. For now, in order to launch:

```
python server/server.py --model_path models/vits_ca/best_model.pth --config_path models/vits_ca/config.json --port 8001
python server/server.py --model_path models/matxa_onnx/best_model.onnx --port 8001
```
that receives the calls from `0.0.0.0:8001`, or simply
```
Expand All @@ -51,16 +67,16 @@ which gets the calls from `0.0.0.0:8000` by default

## Usage

tts-api has three inference endpoints, two openapi ones (as can be seen via `/docs`) and one websocket endpoint:
tts-api has three inference endpoints, two openapi ones (as can be seen via `/docs`)

* `/api/tts`: main inference endpoint
* `/audio-stream`: websocket endpoint; capable of doing async inference, as soon as the first segment is synthesized the audio starts streaming.
#

The example for `/api/tts` can be found in `/docs`. The websocket request is contingent on the communication with the client, hence we provide an example client at the `/websocket-demo` endpoint. For the `api/tts` the call is as the following:
The example for `/api/tts` can be found in `/docs`. For `/api/tts`, the call is as follows:

```
curl --location --request POST 'http://localhost:8080/api/tts' --header 'Content-Type: application/json' --data-raw '{
"voice": "f_cen_81",
curl --location --request POST 'http://localhost:8000/api/tts' --header 'Content-Type: application/json' --data-raw '{
"voice": "quim",
"type": "text",
"text": "El Consell s’ha reunit avui per darrera vegada abans de les eleccions. Divendres vinent, tant el president com els consellers ja estaran en funcions. A l’ordre del dia d’avui tampoc no hi havia l’aprovació del requisit lingüístic, és a dir la normativa que ha de regular la capacitació lingüística dels aspirants a accedir a un lloc en la Funció Pública Valenciana.",
"language": "ca-es" }' --output tts.wav
Expand All @@ -73,7 +89,7 @@ To launch using lastest version available on the Dockerhub:


```
docker run --shm-size=1gb -p 8080:8000 projecteaina/tts-api:latest
docker run -p 8000:8000 projecteaina/tts-api:latest
```

[Check out the documentation available on the Dockerhub](https://hub.docker.com/r/projecteaina/tts-api)
Expand All @@ -87,9 +103,9 @@ docker build -t tts-api .

To launch:
```
docker run --shm-size=1gb -p 8080:8000 tts-api
docker run -p 8000:8000 tts-api
```
The default entrypoint puts the web interface to `http://0.0.0.0:8080/`.
The default entrypoint serves the web interface at `http://0.0.0.0:8000/`.


## Develop in docker
Expand All @@ -101,14 +117,7 @@ To run in dev mode run the following command.
make dev
```

> [!NOTE]
> The model **best_model.pth** is requiered, you have to download by yourself.
```bash
wget -q http://share.laklak.eu/model_vits_ca/best_model_8khz.pth -P models/vits_ca/
```
```bash
mv models/vits_ca/best_model_8khz.pth models/vits_ca/best_model.pth
```


## REST API Endpoints

Expand All @@ -120,15 +129,14 @@ mv models/vits_ca/best_model_8khz.pth models/vits_ca/best_model.pth

| **Parameter** | **Type** | **Description** |
|---------------|--------------------|------------------------------------------------------------|
| language | string | ISO language code (e.g., "ca-es") |
| language | string | ISO language code (e.g., "ca-es", "ca-ba", "ca-nw", "ca-va") |
| voice | string | Name of the voice to use |
| type | string | Type of input text ("text" or "ssml") |
| text | string | Text to be synthesized (if type is "ssml", enclose in tags) |


**NOTES:**
- ssml format is not available yet.
- Currently, only "ca-es" language is supported, and will be applied by default

**Successful Response:**

Expand All @@ -151,10 +159,9 @@ POST /api/tts
#### Command line deployment arguments
| **Argument** | **Type** | **Default** | **Description** |
|------------------------|----------|-----------------------------------------|-------------------------------------------------------------------------------|
| mp_workers | int | 2 | Number of CPUs used for multiprocessing. |
| speech_speed | float | 1.0 | Change the speech speed. |

- mp_workers: the "mp_workers" argument specifies the number of separate processes used for inference. For example, if mp_workers is set to 2 and the input consists of 2 sentences, there will be a process assigned to each sentence, speeding up inference.


- The "speech_speed" argument refers to a parameter that adjusts the rate at which speech sounds in an audio output, with higher values resulting in faster speech, and lower values leading to slower speech.

Expand All @@ -168,22 +175,16 @@ To deploy this project, you will need to add the following environment variables

`SPEECH_SPEED`

`MP_WORKERS`

`USE_CUDA`

`USE_MP`

`SHM_SIZE`
`HF_TOKEN` (required only if you build the Docker image from this repository; a Hugging Face token is needed to download the TTS model)


Example of .env file
```bash
SPEECH_SPEED=1.0
MP_WORKERS=4
USE_CUDA=False
USE_MP=True
SHM_SIZE=2gb
HF_TOKEN=REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN
```


Expand Down
Loading

0 comments on commit 1412f52

Please sign in to comment.