OwlAIProject · etown · Mar 1, 2024 · Mar 1, 2024 · Mar 1, 2024 · Mar 1, 2024
diff --git a/alembic/versions/b6aff0a993d7_add_person_and_voicesamples.py b/alembic/versions/b6aff0a993d7_add_person_and_voicesamples.py
@@ -0,0 +1,55 @@
+"""Add person and voicesamples
+
+Revision ID: b6aff0a993d7
+Revises: 33bddba74d25
+Create Date: 2024-03-01 08:56:55.205553
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel 
+
+
+# revision identifiers, used by Alembic.
+# `revision` is this migration's own ID and `down_revision` is the revision it
+# revises (they match the "Revision ID" / "Revises" lines in the module docstring).
+revision: str = 'b6aff0a993d7'
+down_revision: Union[str, None] = '33bddba74d25'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Create the ``person`` and ``voicesample`` tables and link ``utterance`` to ``person``."""
+    # Create the referenced table first: both foreign keys below point at
+    # person.id, and databases such as PostgreSQL reject a foreign key whose
+    # target table does not exist yet.
+    op.create_table('person',
+        sa.Column('created_at', sa.DateTime(), nullable=False),
+        sa.Column('updated_at', sa.DateTime(), nullable=False),
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('first_name', sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+        sa.Column('last_name', sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+        sa.PrimaryKeyConstraint('id')
+    )
+    op.create_table('voicesample',
+        sa.Column('created_at', sa.DateTime(), nullable=False),
+        sa.Column('updated_at', sa.DateTime(), nullable=False),
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('filepath', sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+        sa.Column('speaker_embeddings', sa.JSON(), nullable=True),
+        sa.Column('person_id', sa.Integer(), nullable=True),
+        sa.ForeignKeyConstraint(['person_id'], ['person.id'], name='fk_voicesample_person'),
+        sa.PrimaryKeyConstraint('id')
+    )
+
+    # Use batch operations to support SQLite ALTER TABLE for adding constraints.
+    # The new column is nullable, so existing utterance rows need no backfill.
+    with op.batch_alter_table('utterance', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('person_id', sa.Integer(), nullable=True))
+        batch_op.create_foreign_key('fk_utterance_person', 'person', ['person_id'], ['id'])
+
+def downgrade() -> None:
+    """Revert this revision: unlink ``utterance`` from ``person``, then drop both new tables."""
+    # Batch mode is used so the constraint and column can be removed on SQLite.
+    with op.batch_alter_table('utterance', schema=None) as utterance_batch:
+        utterance_batch.drop_constraint('fk_utterance_person', type_='foreignkey')
+        utterance_batch.drop_column('person_id')
+
+    # Drop the referencing table (voicesample) before the table it references (person).
+    for table_name in ('voicesample', 'person'):
+        op.drop_table(table_name)
diff --git a/owl/core/cli.py b/owl/core/cli.py
@@ -13,6 +13,9 @@
 import subprocess
 from alembic import command
 from alembic.config import Config
+from ..database.database import Database
+from ..database.crud import create_person, create_voice_sample
+from ..models.schemas import Person, VoiceSample
 
 import click
 from rich.console import Console
@@ -202,6 +205,40 @@ def create_migration(config: Configuration, message: str):
 
     console.log(f"[bold green]Migration script generated with message: '{message}'")
 
+####################################################################################################
+# Persons
+####################################################################################################
+
+@cli.command()
+@add_options(_config_options)
+@click.option('--first-name', required=True, help='First name of the person')
+@click.option('--last-name', required=True, help='Last name of the person')
+@click.option('--voice-sample-path', required=True, help='Path to the voice sample file')
+def enroll_speaker(config: Configuration, first_name: str, last_name: str, voice_sample_path: str):
+    """Enroll a new person with a voice sample."""
+    # Steps: validate inputs, create the Person row, copy the sample file to
+    # <voice_sample_directory>/<person id>/<uuid><ext>, then record a
+    # VoiceSample row pointing at the copy. Raises click.ClickException when
+    # speaker identification is not configured or the sample file is missing.
+    # (Details live in comments because Click prints the docstring as --help text.)
+    import shutil  # local import: keeps this command self-contained
+
+    console = Console()
+    console.log("[bold green]Enrolling speaker...")
+
+    # Validate everything up front so a bad invocation cannot leave a Person
+    # row without a sample. Both the config section and its directory are
+    # optional in the configuration model.
+    speaker_config = config.speaker_identification
+    if speaker_config is None or not speaker_config.voice_sample_directory:
+        raise click.ClickException(
+            "speaker_identification.voice_sample_directory must be set in the configuration to enroll speakers"
+        )
+    if not os.path.isfile(voice_sample_path):
+        raise click.ClickException(f"Voice sample file not found: '{voice_sample_path}'")
+
+    database = Database(config.database)
+    with next(database.get_db()) as db:
+        person = create_person(db, Person(first_name=first_name, last_name=last_name))
+
+    # Each person gets their own sub-directory named after their database ID.
+    sample_directory = os.path.join(speaker_config.voice_sample_directory, str(person.id))
+    os.makedirs(sample_directory, exist_ok=True)
+
+    # Keep the source extension (dot included); a file without one gets a bare
+    # UUID name instead of a name ending in ".".
+    extension = os.path.splitext(voice_sample_path)[1]
+    sample_file_path = os.path.join(sample_directory, f"{uuid.uuid1().hex}{extension}")
+
+    # Copy before recording the sample so a failed copy never leaves a
+    # VoiceSample row that points at a file that does not exist.
+    shutil.copyfile(voice_sample_path, sample_file_path)
+
+    with next(database.get_db()) as db:
+        voice_sample = create_voice_sample(db, VoiceSample(person_id=person.id, filepath=sample_file_path))
+
+    console.log(f"[bold green]Enrolled new person: '{person.id} ({voice_sample.id})'")
+
 ####################################################################################################
 # Server
 ####################################################################################################

diff --git a/owl/core/config.py b/owl/core/config.py
@@ -49,6 +49,10 @@ class StreamingTranscriptionConfiguration(BaseModel):
 class AsyncTranscriptionConfiguration(BaseModel):
     provider: str
 
+# Settings for the speaker identification feature (the `speaker_identification`
+# section of the configuration).
+class SpeakerIdentificationConfiguration(BaseModel):
+    # Name of the speaker identification backend to use.
+    provider: str
+    # Directory under which enrolled voice sample files are stored; optional,
+    # defaults to None when omitted.
+    voice_sample_directory: Optional[str] = None
+
 class DatabaseConfiguration(BaseModel):
     url: str
 
@@ -104,4 +108,5 @@ def load_config_yaml(cls, config_filepath: str) -> 'Configuration':
     conversation_endpointing: ConversationEndpointingConfiguration
     notification: NotificationConfiguration
     udp: UDPConfiguration
-    bing: BingConfiguration | None = None
+    bing: BingConfiguration | None = None
+    speaker_identification: SpeakerIdentificationConfiguration | None = None
diff --git a/owl/database/crud.py b/owl/database/crud.py
@@ -1,5 +1,5 @@
 from sqlmodel import SQLModel, Session, select
-from ..models.schemas import Transcription, Conversation, Utterance, Location, CaptureSegment, Capture, ConversationState
+from ..models.schemas import Transcription, Conversation, Utterance, Location, CaptureSegment, Capture, ConversationState, Person, VoiceSample
 from typing import List, Optional
 from sqlalchemy.orm import joinedload, selectinload
 from sqlalchemy import desc, func, or_
@@ -8,6 +8,18 @@
 
 logger = logging.getLogger(__name__)
 
+def create_person(db: Session, person: Person) -> Person:
+    """Add ``person`` to the session, commit, and return it refreshed from the database."""
+    db.add(person)
+    db.commit()
+    # Reload the instance after the commit so its attributes reflect the stored row.
+    db.refresh(person)
+    return person
+
+def create_voice_sample(db: Session, voice_sample: VoiceSample) -> VoiceSample:
+    """Add ``voice_sample`` to the session, commit, and return it refreshed from the database."""
+    db.add(voice_sample)
+    db.commit()
+    # Reload the instance after the commit so its attributes reflect the stored row.
+    db.refresh(voice_sample)
+    return voice_sample
+
 def create_utterance(db: Session, utterance: Utterance) -> Utterance:
     db.add(utterance)
     db.commit()

diff --git a/owl/models/schemas.py b/owl/models/schemas.py
@@ -1,5 +1,5 @@
 from typing import List, Optional
-from sqlmodel import SQLModel, Field, Relationship
+from sqlmodel import SQLModel, Field, Relationship, Column, JSON
 from datetime import datetime, timezone
 from pydantic import BaseModel
 from enum import Enum
@@ -36,6 +36,8 @@ class Utterance(CreatedAtMixin, table=True):
     transcription: "Transcription" = Relationship(back_populates="utterances")
 
     words: List[Word] = Relationship(back_populates="utterance", sa_relationship_kwargs={"cascade": "all, delete-orphan"})
+    person_id: Optional[int] = Field(default=None, foreign_key="person.id")
+    person: Optional["Person"] = Relationship(back_populates="utterances")
 
 class Transcription(CreatedAtMixin, table=True):
     id: Optional[int] = Field(default=None, primary_key=True)
@@ -106,6 +108,19 @@ class CaptureSegment(CreatedAtMixin, table=True):
 
     conversation: Optional[Conversation] = Relationship(back_populates="capture_segment_file")
 
+# A named individual that utterances and voice samples can be linked to.
+class Person(CreatedAtMixin, table=True):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    first_name: str
+    last_name: str
+    # Voice samples belonging to this person (other side: VoiceSample.person).
+    voice_samples: List["VoiceSample"] = Relationship(back_populates="person")
+    # Utterances linked to this person (other side: Utterance.person).
+    utterances: List[Utterance] = Relationship(back_populates="person")
+
+# A voice sample file on disk, optionally linked to the person it belongs to.
+class VoiceSample(CreatedAtMixin, table=True):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    # Path of the stored sample file (required).
+    filepath: str = Field(...)
+    # Stored in a JSON column. default_factory gives every instance its own
+    # empty dict instead of declaring a single mutable `{}` as the default.
+    speaker_embeddings: dict = Field(default_factory=dict, sa_column=Column(JSON))
+    person_id: Optional[int] = Field(default=None, foreign_key="person.id")
+    # Owning person (other side: Person.voice_samples).
+    person: Optional["Person"] = Relationship(back_populates="voice_samples")
 
 #  API Response Models
 #  https://sqlmodel.tiangolo.com/tutorial/fastapi/relationships/#dont-include-all-the-data

diff --git a/owl/sample_config.yaml b/owl/sample_config.yaml
@@ -86,6 +86,10 @@ udp:
   host: '0.0.0.0'
   port: 8001
 
+# Speaker identification. Voice samples enrolled with the `enroll_speaker` CLI
+# command are copied into <voice_sample_directory>/<person id>/.
+speaker_identification:
+  provider: speech_brain
+  voice_sample_directory: voice_samples
+
 # To enable web search
 # bing:
 #   subscription_key: your_bing_subscription_service_key
diff --git a/owl/services/stt/speaker_identification/__init__.py b/owl/services/stt/speaker_identification/__init__.py
diff --git a/owl/services/stt/speaker_identification/abstract_speaker_identification_service.py b/owl/services/stt/speaker_identification/abstract_speaker_identification_service.py
@@ -0,0 +1,8 @@
+from abc import ABC, abstractmethod
+from ....models.schemas import Transcript
+
+class AbstractSpeakerIdentificationService(ABC):
+    """Interface that speaker identification services must implement."""
+
+    # NOTE(review): the method name is misspelled ("identifiy"). It is left as-is
+    # here because renaming it would change the interface for any existing
+    # implementations or callers; fix it in a coordinated rename.
+    @abstractmethod
+    async def identifiy_speakers(self, transcript: Transcript, persons) -> Transcript:
+        """Return a transcript for ``transcript`` given the candidate ``persons``.
+
+        Presumably implementations attribute the transcript's utterances to the
+        given persons, and ``persons`` is a collection of ``Person`` records;
+        neither is established by this interface - TODO confirm against a
+        concrete implementation.
+        """
+        pass