Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Diarization enhancements + dependencies update #157

Merged
merged 13 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ MAX_WAIT=0.1
# docker cmd: -v /path/to/custom/model:/app/models/custom
WHISPER_MODEL="large-v2"
# The compute_type parameter is used to control the precision of the model. You can choose between:
# "int8", "int8_float16", "int16", "float_16". The default value is "int8_float16", which is the fastest option with
# minimal loss in accuracy using the `large-v2` model.
# "int8", "int8_float16", "int16", "float16", "float32". The default value is "float16".
COMPUTE_TYPE="float16"
# The extra_languages parameter is used to control the languages that need an extra model to be loaded.
# You can specify multiple languages separated by a comma. The available languages are: `he` (Hebrew).
Expand Down
1,723 changes: 780 additions & 943 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ aiofiles = ">=23.1.0"
argon2-cffi = ">=21.3.0"
cython = ">=0.29.24"
fastapi = ">=0.96.0"
faster-whisper = ">=0.3.0"
faster-whisper = ">=0.7"
ffmpeg-python = ">=0.2.0"
loguru = ">=0.6.0"
nemo-toolkit = { version = ">=1.17.0", extras = ["asr"] }
nemo-toolkit = { version = "1.19.1", extras = ["asr"] }
numpy = "==1.23.1"
num2words = ">=0.5.12"
onnxruntime = ">=1.15.0"
Expand Down
248 changes: 227 additions & 21 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,94 @@
CortexPayload,
CortexUrlResponse,
CortexYoutubeResponse,
Utterance,
Word,
YouTubeResponse,
)


def test_word() -> None:
    """Build a Word and verify that every field reads back unchanged."""
    # Keep the constructor arguments in one place so the call stays compact.
    word_kwargs = {"word": "test", "start": 0.0, "end": 1.0, "score": 0.9}
    word = Word(**word_kwargs)

    assert word.word == "test"
    assert word.start == 0.0
    assert word.end == 1.0
    assert word.score == 0.9


def test_utterance() -> None:
    """Build an Utterance with four words and verify each field reads back."""
    # (word, start, end, score) for every word of the sentence, in order.
    word_specs = (
        ("This", 0.0, 1.0, 0.9),
        ("is", 1.0, 2.0, 0.75),
        ("a", 2.0, 3.0, 0.8),
        ("test.", 3.0, 4.0, 0.85),
    )

    def make_words():
        # Return a fresh list of fresh Word instances on every call, so the
        # expected value below never shares objects with the model's input.
        return [
            Word(word=token, start=begin, end=finish, score=confidence)
            for token, begin, finish, confidence in word_specs
        ]

    utterance = Utterance(
        text="This is a test.",
        start=0.0,
        end=4.0,
        speaker=0,
        words=make_words(),
    )

    assert utterance.text == "This is a test."
    assert utterance.start == 0.0
    assert utterance.end == 4.0
    assert utterance.speaker == 0
    assert utterance.words is not None
    assert utterance.words == make_words()
    assert isinstance(utterance.words[0], Word)


def test_audio_request() -> None:
"""Test the AudioRequest model."""
request = AudioRequest(
Expand Down Expand Up @@ -77,8 +161,20 @@ def test_audio_response() -> None:

response = AudioResponse(
utterances=[
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
],
audio_duration=6.0,
alignment=True,
Expand All @@ -92,8 +188,20 @@ def test_audio_response() -> None:
internal_vad=False,
)
assert response.utterances == [
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
]
assert response.audio_duration == 6.0
assert response.alignment is True
Expand Down Expand Up @@ -134,16 +242,30 @@ def test_base_request_default() -> None:

def test_base_request_invalid() -> None:
"""Test the BaseRequest model with invalid data."""
with pytest.raises(ValueError, match="timestamps must be one of 'hms', 'ms', 's'."):
with pytest.raises(
ValueError, match="`timestamps` must be one of 'hms', 'ms', 's'."
):
BaseRequest(timestamps="invalid")


def test_base_response() -> None:
"""Test the BaseResponse model."""
response = BaseResponse(
utterances=[
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
],
audio_duration=6.0,
alignment=True,
Expand All @@ -156,8 +278,20 @@ def test_base_response() -> None:
internal_vad=False,
)
assert response.utterances == [
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
]
assert response.audio_duration == 6.0
assert response.alignment is True
Expand Down Expand Up @@ -215,8 +349,20 @@ def test_cortex_url_response() -> None:
"""Test the CortexUrlResponse model."""
response = CortexUrlResponse(
utterances=[
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
],
audio_duration=6.0,
alignment=True,
Expand All @@ -232,8 +378,20 @@ def test_cortex_url_response() -> None:
request_id="test_request_id",
)
assert response.utterances == [
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
]
assert response.audio_duration == 6.0
assert response.alignment is True
Expand All @@ -253,8 +411,20 @@ def test_cortex_youtube_response() -> None:
"""Test the CortexYoutubeResponse model."""
response = CortexYoutubeResponse(
utterances=[
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
],
audio_duration=6.0,
alignment=True,
Expand All @@ -270,8 +440,20 @@ def test_cortex_youtube_response() -> None:
request_id="test_request_id",
)
assert response.utterances == [
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
]
assert response.audio_duration == 6.0
assert response.alignment is True
Expand All @@ -291,8 +473,20 @@ def test_youtube_response() -> None:
"""Test the YouTubeResponse model."""
response = YouTubeResponse(
utterances=[
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
],
audio_duration=6.0,
alignment=True,
Expand All @@ -306,8 +500,20 @@ def test_youtube_response() -> None:
video_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
)
assert response.utterances == [
{"text": "Never gonna give you up", "start": 0.0, "end": 3.0},
{"text": "Never gonna let you down", "start": 3.0, "end": 6.0},
Utterance(
text="Never gonna give you up",
start=0.0,
end=3.0,
speaker=0,
words=[],
),
Utterance(
text="Never gonna let you down",
start=3.0,
end=6.0,
speaker=1,
words=[],
),
]
assert response.audio_duration == 6.0
assert response.alignment is True
Expand Down
Loading