import logging
import os
from io import BytesIO, StringIO

import webvtt
from langchain.text_splitter import RecursiveCharacterTextSplitter

from .base_chunker import BaseChunker


class TranscriptionChunker(BaseChunker):
"""
TranscriptionChunker is a class designed to process and chunk transcription text content, specifically from WebVTT (Web Video Text Tracks) format files. It utilizes the RecursiveCharacterTextSplitter to segment the transcription into manageable chunks, considering token limits and content structure.
Initialization:
---------------
The TranscriptionChunker is initialized with the following parameters:
- data (str): The transcription text content to be chunked.
- max_chunk_size (int, optional): The maximum size of each chunk in tokens. Defaults to 2048 tokens or the value specified in the `NUM_TOKENS` environment variable.
- token_overlap (int, optional): The number of overlapping tokens between consecutive chunks. Defaults to 100 tokens.
Methods:
--------
- get_chunks():
Processes the transcription text and generates chunks based on the specified chunking parameters.
It first processes the WebVTT file, extracts the text, and then splits the content into chunks.
The method includes a mechanism to summarize the content and attaches this summary to each chunk.
- _vtt_process():
Converts the WebVTT content into a continuous text block, retaining speaker changes.
It processes each caption, merging text from the same speaker and separating segments by speaker changes.
- _chunk_document_content():
Splits the processed document content into chunks using the RecursiveCharacterTextSplitter.
This method yields each chunk as it is created.
Attributes:
-----------
- max_chunk_size (int): Maximum allowed tokens per chunk.
- token_overlap (int): Number of overlapping tokens between chunks.
- document_content (str): The content of the document.
- aoai_client: An instance for generating summaries and processing content with OpenAI models.
- token_estimator: A utility for estimating the number of tokens in a given text.
"""

    def __init__(self, data, max_chunk_size=None, token_overlap=None):
        """
        Initializes the TranscriptionChunker with the given data and sets up chunking
        parameters from the arguments or environment variables.

        Args:
            data (str): The document content to be chunked.
            max_chunk_size (int, optional): Maximum tokens per chunk. Defaults to the
                NUM_TOKENS environment variable, or 2048 if it is not set.
            token_overlap (int, optional): Number of overlapping tokens between
                consecutive chunks. Defaults to 100.
        """
        super().__init__(data)
        self.max_chunk_size = max_chunk_size or int(os.getenv("NUM_TOKENS", "2048"))
        self.token_overlap = token_overlap or 100

    def get_chunks(self):
        chunks = []
        logging.info(f"[transcription_chunker][{self.filename}] Running get_chunks.")

        # Extract the text from the vtt file
        text = self._vtt_process()
        logging.debug(f"[transcription_chunker][{self.filename}] transcription text: {text[:100]}")

        # Get the summary of the text
        prompt = f"Provide a clearly elaborated summary, along with the key points and values mentioned, for this transcript of a conversation: {text}"
        summary = self.aoai_client.get_completion(prompt)

        # Split the transcript into chunks and attach the summary to each one
        text_chunks = self._chunk_document_content(text)
        chunk_id = 0
        for text_chunk in text_chunks:
            chunk_id += 1
            chunk_size = self.token_estimator.estimate_tokens(text_chunk)
            if chunk_size > self.max_chunk_size:
                logging.debug(f"[transcription_chunker][{self.filename}] truncating {chunk_size} size chunk to fit within {self.max_chunk_size} tokens")
                text_chunk = self._truncate_chunk(text_chunk)
            chunk_dict = self._create_chunk(chunk_id=chunk_id, content=text_chunk, embedding_text=summary, summary=summary)
            chunks.append(chunk_dict)
        return chunks

    def _vtt_process(self):
        """Flattens the WebVTT captions into plain text, one line per speaker turn."""
        blob_data = self.document_bytes
        blob_stream = BytesIO(blob_data)
        vtt = webvtt.read_buffer(blob_stream)
        data, text, voice = [], "", ""
        for caption in vtt:
            current_voice = caption.voice or ""
            if current_voice != voice:
                # Speaker changed: flush the previous speaker's text and start a new
                # segment prefixed with the new speaker's name (if one is present).
                if text:
                    data.append(text.replace("\n", " "))
                voice = current_voice
                text = f"{current_voice}: {caption.text} " if current_voice else caption.text + " "
            else:
                # Same speaker: keep accumulating the current segment.
                text += caption.text + " "
        if text:
            data.append(text.replace("\n", " "))
        return "\n".join(data).strip()

    def _chunk_document_content(self, text):
        """Yields token-bounded chunks of the flattened transcript."""
        sentence_endings = [".", "!", "?"]
        word_breaks = [" ", "\n", "\t"]
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            separators=sentence_endings + word_breaks,
            chunk_size=self.max_chunk_size,
            chunk_overlap=self.token_overlap
        )
        chunked_content_list = splitter.split_text(text)
        for chunked_content in chunked_content_list:
            yield chunked_content  # type: ignore
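

# ---------------------------------------------------------------------------
# Minimal local sketch (not part of the original class): it exercises the same
# speaker-aware flattening and token-based splitting steps shown above on a
# small, invented in-memory WebVTT sample, so the logic can be smoke-tested
# without BaseChunker, the aoai_client summary call, or blob storage. The
# sample transcript and speaker names are assumptions for illustration only;
# it relies on the same webvtt-py (caption.voice) and langchain APIs already
# imported at the top of this file.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_vtt = (
        "WEBVTT\n"
        "\n"
        "00:00:00.000 --> 00:00:02.000\n"
        "<v Alice>Hello, welcome to the call.\n"
        "\n"
        "00:00:02.000 --> 00:00:04.000\n"
        "<v Alice>Today we review the roadmap.\n"
        "\n"
        "00:00:04.000 --> 00:00:06.000\n"
        "<v Bob>Thanks, happy to be here.\n"
    )

    # Speaker-aware flattening, mirroring _vtt_process.
    lines, text, voice = [], "", ""
    for caption in webvtt.read_buffer(StringIO(sample_vtt)):
        current_voice = caption.voice or ""
        if current_voice != voice:
            if text:
                lines.append(text.replace("\n", " "))
            voice = current_voice
            text = f"{current_voice}: {caption.text} " if current_voice else caption.text + " "
        else:
            text += caption.text + " "
    if text:
        lines.append(text.replace("\n", " "))
    transcript = "\n".join(lines).strip()
    print(transcript)

    # Token-based splitting, mirroring _chunk_document_content.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        separators=[".", "!", "?", " ", "\n", "\t"],
        chunk_size=int(os.getenv("NUM_TOKENS", "2048")),
        chunk_overlap=100,
    )
    for chunk_id, chunk in enumerate(splitter.split_text(transcript), start=1):
        print(f"chunk {chunk_id}: {chunk[:80]}")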