diff --git a/pipes/search/src/components/example-search-cards.tsx b/pipes/search/src/components/example-search-cards.tsx index 588be3870..c8ff98209 100644 --- a/pipes/search/src/components/example-search-cards.tsx +++ b/pipes/search/src/components/example-search-cards.tsx @@ -1,4 +1,4 @@ -import React from "react"; +import React, { useEffect, useState } from "react"; import { Card, CardContent } from "@/components/ui/card"; import { Search, Mail, Clock, AlertCircle } from "lucide-react"; import { Badge } from "./ui/badge"; @@ -24,33 +24,37 @@ interface ExampleSearchCardsProps { onSelect: (example: ExampleSearch) => void; } -const exampleSearches: ExampleSearch[] = [ - { - title: "summarize last hour meeting", - contentType: "audio", - limit: 120, - minLength: 10, - startDate: new Date(Date.now() - 60 * 60 * 1000), // 1 hour ago - }, - { - title: "summarize my mails", - contentType: "ocr", - windowName: "gmail", - limit: 25, - minLength: 50, - startDate: new Date(new Date().setHours(0, 0, 0, 0)), // since midnight local time - }, - { - title: "time spent last hour", - contentType: "ocr", - limit: 25, - minLength: 50, - startDate: new Date(Date.now() - 60 * 60 * 1000), // 1 hour ago - }, -]; - export function ExampleSearchCards({ onSelect }: ExampleSearchCardsProps) { + const [exampleSearches, setExampleSearches] = useState([]); const { health } = useHealthCheck(); + + useEffect(() => { + setExampleSearches([ + { + title: "summarize last hour meeting", + contentType: "audio", + limit: 120, + minLength: 10, + startDate: new Date(Date.now() - 60 * 60 * 1000), // 1 hour ago + }, + { + title: "summarize my mails", + contentType: "ocr", + windowName: "gmail", + limit: 25, + minLength: 50, + startDate: new Date(new Date().setHours(0, 0, 0, 0)), // since midnight local time + }, + { + title: "time spent last hour", + contentType: "ocr", + limit: 25, + minLength: 50, + startDate: new Date(Date.now() - 60 * 60 * 1000), // 1 hour ago + }, + ]); + }, []); + const getIcon = (title: string) => { switch (title) { case "summarize last hour meeting": diff --git a/pipes/search/src/components/search-chat.tsx b/pipes/search/src/components/search-chat.tsx index eb510add1..9fb0d6fc3 100644 --- a/pipes/search/src/components/search-chat.tsx +++ b/pipes/search/src/components/search-chat.tsx @@ -992,7 +992,11 @@ export function SearchChat() { {item.content.filePath && item.content.filePath.trim() !== "" ? (
-                                <VideoComponent filePath={item.content.filePath} />
+                                <VideoComponent
+                                  filePath={item.content.filePath}
+                                  startTime={item.content.startTime}
+                                  endTime={item.content.endTime}
+                                />
                              ) : (

diff --git a/pipes/search/src/components/video.tsx b/pipes/search/src/components/video.tsx index 20adfda27..0028a7f23 100644 --- a/pipes/search/src/components/video.tsx +++ b/pipes/search/src/components/video.tsx @@ -1,15 +1,19 @@ -import { memo, useCallback, useEffect, useState } from "react"; +import { memo, useCallback, useEffect, useState, useRef, useMemo } from "react"; +import { getMediaFile } from "@/lib/actions/video-actions"; import { cn } from "@/lib/utils"; -import { getMediaFile } from '@/lib/actions/video-actions' export const VideoComponent = memo(function VideoComponent({ filePath, customDescription, className, + startTime, + endTime, }: { filePath: string; customDescription?: string; className?: string; + startTime?: number; + endTime?: number; }) { const [mediaSrc, setMediaSrc] = useState(null); const [error, setError] = useState(null); @@ -28,39 +32,19 @@ export const VideoComponent = memo(function VideoComponent({ const renderFileLink = () => ( // TODO button open link -

+

{customDescription || filePath} -

+
); - const getMimeType = (path: string): string => { - const ext = path.split(".").pop()?.toLowerCase(); - switch (ext) { - case "mp4": - return "video/mp4"; - case "webm": - return "video/webm"; - case "ogg": - return "video/ogg"; - case "mp3": - return "audio/mpeg"; - case "wav": - return "audio/wav"; - default: - return isAudio ? "audio/mpeg" : "video/mp4"; - } - }; - useEffect(() => { async function loadMedia() { try { console.log("Loading media:", filePath); const sanitizedPath = sanitizeFilePath(filePath); console.log("Sanitized path:", sanitizedPath); - if (!sanitizedPath) { - throw new Error("Invalid file path"); - } + // Set isAudio based on path check setIsAudio( sanitizedPath.toLowerCase().includes("input") || sanitizedPath.toLowerCase().includes("output") @@ -115,12 +99,11 @@ export const VideoComponent = memo(function VideoComponent({ return (
      {isAudio ? (
-
-
-
+        <AudioPlayer startTime={startTime} endTime={endTime} mediaSrc={mediaSrc} />
      ) : (
); }); + +const AudioPlayer = memo(function AudioPlayer({ + startTime, + endTime, + mediaSrc, +}: { + startTime?: number; + endTime?: number; + mediaSrc: string; +}) { + const [duration, setDuration] = useState(0); + const [currentTime, setCurrentTime] = useState(0); + const [isPlaying, setIsPlaying] = useState(false); + const audioRef = useRef(null); + + const audioElement = useMemo( + () => ( + + ), + [mediaSrc, startTime, currentTime] + ); + + const togglePlay = async () => { + if (!audioRef.current) return; + + try { + if (isPlaying) { + audioRef.current.pause(); + } else { + await audioRef.current.play(); + } + setIsPlaying(!isPlaying); + } catch (error) { + console.error("Playback failed:", error); + setIsPlaying(false); + } + }; + + const handleTimeChange = async (e: React.ChangeEvent) => { + if (!audioRef.current) return; + + const time = parseFloat(e.target.value); + const wasPlaying = isPlaying; + + if (wasPlaying) { + audioRef.current.pause(); + } + + // Set the time directly on the audio element first + audioRef.current.currentTime = time; + // Then update the state + setCurrentTime(time); + + if (wasPlaying) { + try { + await audioRef.current.play(); + } catch (error) { + console.error("Playback failed:", error); + setIsPlaying(false); + } + } + }; + + return ( +
+
+ {startTime !== null && ( +
+
+ Start +
+
+ )} + {endTime !== null && ( +
+
+ End +
+
+ )} + +
+
+
+
+
+ +
+ {audioElement} +
+
+ ); +}); diff --git a/screenpipe-app-tauri/components/identify-speakers.tsx b/screenpipe-app-tauri/components/identify-speakers.tsx index 72bde93fb..bf048e844 100644 --- a/screenpipe-app-tauri/components/identify-speakers.tsx +++ b/screenpipe-app-tauri/components/identify-speakers.tsx @@ -226,6 +226,8 @@ export default function IdentifySpeakers({ { path: longestAudioSample?.path || "", transcript: longestAudioSample?.transcript || "", + startTime: longestAudioSample?.startTime, + endTime: longestAudioSample?.endTime, }, ], }, @@ -269,7 +271,7 @@ export default function IdentifySpeakers({ async (speaker: Speaker) => { const durations: Map = new Map(); for (const sample of speaker.metadata?.audioSamples || []) { - const size = await getFileSize(sample.path); + const size = (sample.endTime ?? 0) - (sample.startTime ?? 0); durations.set(sample.path, size); } @@ -637,6 +639,8 @@ export default function IdentifySpeakers({ ))} @@ -681,6 +685,8 @@ export default function IdentifySpeakers({ @@ -1078,6 +1084,8 @@ export default function IdentifySpeakers({ key={index} filePath={sample.path} customDescription={`transcript: ${sample.transcript}`} + startTime={sample.startTime} + endTime={sample.endTime} /> ))}
@@ -1123,6 +1131,8 @@ export default function IdentifySpeakers({ filePath={sample.path} customDescription={`transcript: ${sample.transcript}`} className="max-w-[300px]" + startTime={sample.startTime} + endTime={sample.endTime} /> ) )} diff --git a/screenpipe-app-tauri/components/video.tsx b/screenpipe-app-tauri/components/video.tsx index 3fcd9af80..2afcda48c 100644 --- a/screenpipe-app-tauri/components/video.tsx +++ b/screenpipe-app-tauri/components/video.tsx @@ -1,4 +1,4 @@ -import { memo, useCallback, useEffect, useState } from "react"; +import { memo, useCallback, useEffect, useState, useRef, useMemo } from "react"; import { readFile, open } from "@tauri-apps/plugin-fs"; import { platform } from "@tauri-apps/plugin-os"; import { cn } from "@/lib/utils"; @@ -7,10 +7,14 @@ export const VideoComponent = memo(function VideoComponent({ filePath, customDescription, className, + startTime, + endTime, }: { filePath: string; customDescription?: string; className?: string; + startTime?: number; + endTime?: number; }) { const [mediaSrc, setMediaSrc] = useState(null); const [error, setError] = useState(null); @@ -28,10 +32,9 @@ export const VideoComponent = memo(function VideoComponent({ }, []); const renderFileLink = () => ( - // TODO button open link -

+

{customDescription || filePath} -

+
); const getMimeType = (path: string): string => { @@ -58,17 +61,17 @@ export const VideoComponent = memo(function VideoComponent({ console.log("Loading media:", filePath); const sanitizedPath = await sanitizeFilePath(filePath); console.log("Sanitized path:", sanitizedPath); - if (!sanitizedPath) { - throw new Error("Invalid file path"); - } + const mediaData = await readFile(sanitizedPath); + const mimeType = getMimeType(sanitizedPath); + + // Set isAudio based on path check setIsAudio( sanitizedPath.toLowerCase().includes("input") || sanitizedPath.toLowerCase().includes("output") ); - const mediaData = await readFile(sanitizedPath); - const mimeType = getMimeType(sanitizedPath); + // Create blob URL directly const blob = new Blob([mediaData], { type: mimeType }); setMediaSrc(URL.createObjectURL(blob)); } catch (error) { @@ -112,12 +115,11 @@ export const VideoComponent = memo(function VideoComponent({ return (
      {isAudio ? (
-
-
-
+        <AudioPlayer startTime={startTime} endTime={endTime} mediaSrc={mediaSrc} />
      ) : (
); }); + +const AudioPlayer = memo(function AudioPlayer({ + startTime, + endTime, + mediaSrc, +}: { + startTime?: number; + endTime?: number; + mediaSrc: string; +}) { + const [duration, setDuration] = useState(0); + const [currentTime, setCurrentTime] = useState(0); + const [isPlaying, setIsPlaying] = useState(false); + const audioRef = useRef(null); + + const audioElement = useMemo( + () => ( + + ), + [mediaSrc, startTime, currentTime] + ); + + const togglePlay = async () => { + if (!audioRef.current) return; + + try { + if (isPlaying) { + audioRef.current.pause(); + } else { + await audioRef.current.play(); + } + setIsPlaying(!isPlaying); + } catch (error) { + console.error("Playback failed:", error); + setIsPlaying(false); + } + }; + + const handleTimeChange = async (e: React.ChangeEvent) => { + if (!audioRef.current) return; + + const time = parseFloat(e.target.value); + const wasPlaying = isPlaying; + + if (wasPlaying) { + audioRef.current.pause(); + } + + // Set the time directly on the audio element first + audioRef.current.currentTime = time; + // Then update the state + setCurrentTime(time); + + if (wasPlaying) { + try { + await audioRef.current.play(); + } catch (error) { + console.error("Playback failed:", error); + setIsPlaying(false); + } + } + }; + + return ( +
+
+ {startTime !== undefined && startTime > 0 && ( +
+
+ Start +
+
+ )} + {endTime !== undefined && endTime < duration - 0.1 && ( +
+
+ End +
+
+ )} + +
+
+
+
+
+ +
+ {audioElement} +
+
+ ); +}); diff --git a/screenpipe-app-tauri/lib/screenpipe.ts b/screenpipe-app-tauri/lib/screenpipe.ts index 731a5f7bf..30a92b25b 100644 --- a/screenpipe-app-tauri/lib/screenpipe.ts +++ b/screenpipe-app-tauri/lib/screenpipe.ts @@ -25,6 +25,8 @@ export type AudioContent = { device_name: string; device_type: string; speaker: Speaker; + start_time?: number; + end_time?: number; }; export type FTSContent = { diff --git a/screenpipe-app-tauri/lib/types/speaker.ts b/screenpipe-app-tauri/lib/types/speaker.ts index 7b5222c04..3d0eb12fd 100644 --- a/screenpipe-app-tauri/lib/types/speaker.ts +++ b/screenpipe-app-tauri/lib/types/speaker.ts @@ -1,6 +1,8 @@ export type AudioSample = { path: string; transcript: string; + startTime?: number; + endTime?: number; }; export interface Speaker { diff --git a/screenpipe-audio/src/stt.rs b/screenpipe-audio/src/stt.rs index a5e86076c..1c36deb0d 100644 --- a/screenpipe-audio/src/stt.rs +++ b/screenpipe-audio/src/stt.rs @@ -169,11 +169,9 @@ pub fn stt_sync( whisper_model: &mut WhisperModel, audio_transcription_engine: Arc, deepgram_api_key: Option, - output_path: &PathBuf, languages: Vec, -) -> Result<(String, String)> { +) -> Result { let mut whisper_model = whisper_model.clone(); - let output_path = output_path.clone(); let audio = audio.to_vec(); let device = device.to_string(); @@ -187,8 +185,6 @@ pub fn stt_sync( &mut whisper_model, audio_transcription_engine, deepgram_api_key, - &output_path, - false, languages, )) }); @@ -301,22 +297,13 @@ fn parse_time_tokens(start: &str, end: &str, min_time: &mut f32, max_time: &mut } pub async fn prepare_segments( - audio_input: &AudioInput, + audio_data: &[f32], vad_engine: Arc>>, segmentation_model_path: &PathBuf, embedding_manager: EmbeddingManager, embedding_extractor: Arc>, + device: &str, ) -> Result> { - let audio_data = if audio_input.sample_rate != m::SAMPLE_RATE as u32 { - resample( - audio_input.data.as_ref(), - audio_input.sample_rate, - m::SAMPLE_RATE as u32, - )? - } else { - audio_input.data.as_ref().to_vec() - }; - let audio_data = normalize_v2(&audio_data); let frame_size = 1600; @@ -352,7 +339,7 @@ pub async fn prepare_segments( info!( "device: {}, speech ratio: {}, min_speech_ratio: {}, audio_frames: {}, speech_frames: {}", - audio_input.device, + device, speech_ratio, min_speech_ratio, audio_frames.len(), @@ -387,10 +374,8 @@ pub async fn stt( whisper_model: &mut WhisperModel, audio_transcription_engine: Arc, deepgram_api_key: Option, - output_path: &PathBuf, - skip_encoding: bool, languages: Vec, -) -> Result<(String, String)> { +) -> Result { let model = &whisper_model.model; debug!("Loading mel filters"); @@ -426,25 +411,7 @@ pub async fn stt( process_with_whisper(&mut *whisper_model, audio, &mel_filters, languages) }; - let new_file_name = Utc::now().format("%Y-%m-%d_%H-%M-%S").to_string(); - let sanitized_device_name = device.replace(['/', '\\'], "_"); - let file_path = PathBuf::from(output_path) - .join(format!("{}_{}.mp4", sanitized_device_name, new_file_name)) - .to_str() - .expect("Failed to create valid path") - .to_string(); - let file_path_clone = file_path.clone(); - // Run FFmpeg in a separate task - if !skip_encoding { - encode_single_audio( - bytemuck::cast_slice(audio), - sample_rate, - 1, - &file_path.into(), - )?; - } - - Ok((transcription?, file_path_clone)) + Ok(transcription?) 
} pub fn resample(input: &[f32], from_sample_rate: u32, to_sample_rate: u32) -> Result> { @@ -488,6 +455,8 @@ pub struct TranscriptionResult { pub transcription: Option, pub timestamp: u64, pub error: Option, + pub start_time: f64, + pub end_time: f64, } impl TranscriptionResult { @@ -572,14 +541,33 @@ pub async fn create_whisper_channel( crossbeam::select! { recv(input_receiver) -> input_result => { match input_result { - Ok(audio) => { + Ok(mut audio) => { debug!("Received input from input_receiver"); let timestamp = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("Time went backwards") .as_secs(); - let mut segments = match prepare_segments(&audio, vad_engine.clone(), &segmentation_model_path, embedding_manager.clone(), embedding_extractor.clone()).await { + let audio_data = if audio.sample_rate != m::SAMPLE_RATE as u32 { + match resample( + audio.data.as_ref(), + audio.sample_rate, + m::SAMPLE_RATE as u32, + ) { + Ok(data) => data, + Err(e) => { + error!("Error resampling audio: {:?}", e); + continue; + } + } + } else { + audio.data.as_ref().to_vec() + }; + + audio.data = Arc::new(audio_data.clone()); + audio.sample_rate = m::SAMPLE_RATE as u32; + + let mut segments = match prepare_segments(&audio_data, vad_engine.clone(), &segmentation_model_path, embedding_manager.clone(), embedding_extractor.clone(), &audio.device.to_string()).await { Ok(segments) => segments, Err(e) => { error!("Error preparing segments: {:?}", e); @@ -587,14 +575,30 @@ pub async fn create_whisper_channel( } }; + + let path = match write_audio_to_file( + &audio.data.to_vec(), + audio.sample_rate, + &output_path, + &audio.device.to_string(), + false, + ) { + Ok(file_path) => file_path, + Err(e) => { + error!("Error writing audio to file: {:?}", e); + "".to_string() + } + }; + while let Some(segment) = segments.recv().await { + let path = path.clone(); let transcription_result = if cfg!(target_os = "macos") { let timestamp = timestamp + segment.start.round() as u64; #[cfg(target_os = "macos")] { autoreleasepool(|| { - match stt_sync(&segment.samples, segment.sample_rate, &audio.device.to_string(), &mut whisper_model, audio_transcription_engine.clone(), deepgram_api_key.clone(), &output_path, languages.clone()) { - Ok((transcription, path)) => TranscriptionResult { + match stt_sync(&segment.samples, segment.sample_rate, &audio.device.to_string(), &mut whisper_model, audio_transcription_engine.clone(), deepgram_api_key.clone(), languages.clone()) { + Ok(transcription) => TranscriptionResult { input: AudioInput { data: Arc::new(segment.samples), sample_rate: segment.sample_rate, @@ -606,6 +610,8 @@ pub async fn create_whisper_channel( timestamp, error: None, speaker_embedding: segment.embedding.clone(), + start_time: segment.start, + end_time: segment.end, }, Err(e) => { error!("STT error for input {}: {:?}", audio.device, e); @@ -617,10 +623,12 @@ pub async fn create_whisper_channel( device: audio.device.clone(), }, transcription: None, - path: "".to_string(), + path, timestamp, error: Some(e.to_string()), speaker_embedding: Vec::new(), + start_time: segment.start, + end_time: segment.end, } }, } @@ -631,8 +639,8 @@ pub async fn create_whisper_channel( unreachable!("This code should not be reached on non-macOS platforms") } } else { - match stt_sync(&segment.samples, segment.sample_rate, &audio.device.to_string(), &mut whisper_model, audio_transcription_engine.clone(), deepgram_api_key.clone(), &output_path, languages.clone()) { - Ok((transcription, path)) => TranscriptionResult { + match 
stt_sync(&segment.samples, segment.sample_rate, &audio.device.to_string(), &mut whisper_model, audio_transcription_engine.clone(), deepgram_api_key.clone(), languages.clone()) { + Ok(transcription) => TranscriptionResult { input: AudioInput { data: Arc::new(segment.samples), sample_rate: segment.sample_rate, @@ -644,6 +652,8 @@ pub async fn create_whisper_channel( timestamp, error: None, speaker_embedding: segment.embedding.clone(), + start_time: segment.start, + end_time: segment.end, }, Err(e) => { error!("STT error for input {}: {:?}", audio.device, e); @@ -655,10 +665,12 @@ pub async fn create_whisper_channel( device: audio.device.clone(), }, transcription: None, - path: "".to_string(), + path, timestamp, error: Some(e.to_string()), speaker_embedding: Vec::new(), + start_time: segment.start, + end_time: segment.end, } }, } @@ -723,3 +735,30 @@ pub fn longest_common_word_substring(s1: &str, s2: &str) -> Option<(usize, usize _ => None, } } + +pub fn write_audio_to_file( + audio: &[f32], + sample_rate: u32, + output_path: &PathBuf, + device: &str, + skip_encoding: bool, +) -> Result { + let new_file_name = Utc::now().format("%Y-%m-%d_%H-%M-%S").to_string(); + let sanitized_device_name = device.replace(['/', '\\'], "_"); + let file_path = PathBuf::from(output_path) + .join(format!("{}_{}.mp4", sanitized_device_name, new_file_name)) + .to_str() + .expect("Failed to create valid path") + .to_string(); + let file_path_clone = file_path.clone(); + // Run FFmpeg in a separate task + if !skip_encoding { + encode_single_audio( + bytemuck::cast_slice(audio), + sample_rate, + 1, + &file_path.into(), + )?; + } + Ok(file_path_clone) +} diff --git a/screenpipe-server/src/core.rs b/screenpipe-server/src/core.rs index 24adfcbcc..7710548e8 100644 --- a/screenpipe-server/src/core.rs +++ b/screenpipe-server/src/core.rs @@ -448,7 +448,7 @@ async fn process_audio_result( } } } - match db.insert_audio_chunk(&result.path).await { + match db.get_or_insert_audio_chunk(&result.path).await { Ok(audio_chunk_id) => { if transcription.is_empty() { return Ok(Some(audio_chunk_id)); @@ -462,6 +462,8 @@ async fn process_audio_result( &transcription_engine, &result.input.device, Some(speaker.id), + Some(result.start_time), + Some(result.end_time), ) .await { diff --git a/screenpipe-server/src/db.rs b/screenpipe-server/src/db.rs index ea662d826..ea912a586 100644 --- a/screenpipe-server/src/db.rs +++ b/screenpipe-server/src/db.rs @@ -21,7 +21,7 @@ use tokio::time::{timeout, Duration as TokioDuration}; use zerocopy::AsBytes; use crate::db_types::{ - AudioChunk, AudioEntry, AudioResult, AudioResultRaw, FrameData, OCREntry, OCRResult, + AudioChunksResponse, AudioEntry, AudioResult, AudioResultRaw, FrameData, OCREntry, OCRResult, OCRResultRaw, Speaker, TagContentType, }; use crate::db_types::{ContentType, UiContent}; @@ -111,6 +111,22 @@ impl DatabaseManager { Ok(id) } + async fn get_audio_chunk_id(&self, file_path: &str) -> Result { + let id = sqlx::query_scalar::<_, i64>("SELECT id FROM audio_chunks WHERE file_path = ?1") + .bind(file_path) + .fetch_optional(&self.pool) + .await?; + Ok(id.unwrap_or(0)) + } + + pub async fn get_or_insert_audio_chunk(&self, file_path: &str) -> Result { + let mut id = self.get_audio_chunk_id(file_path).await?; + if id == 0 { + id = self.insert_audio_chunk(file_path).await?; + } + Ok(id) + } + pub async fn insert_audio_transcription( &self, audio_chunk_id: i64, @@ -119,12 +135,14 @@ impl DatabaseManager { transcription_engine: &str, device: &AudioDevice, speaker_id: Option, + start_time: Option, 
+ end_time: Option, ) -> Result { let mut tx = self.pool.begin().await?; // Insert the full transcription let id = sqlx::query( - "INSERT INTO audio_transcriptions (audio_chunk_id, transcription, offset_index, timestamp, transcription_engine, device, is_input_device, speaker_id) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)", + "INSERT INTO audio_transcriptions (audio_chunk_id, transcription, offset_index, timestamp, transcription_engine, device, is_input_device, speaker_id, start_time, end_time) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)", ) .bind(audio_chunk_id) .bind(transcription) @@ -134,6 +152,8 @@ impl DatabaseManager { .bind(&device.name) .bind(device.device_type == DeviceType::Input) .bind(speaker_id) + .bind(start_time) + .bind(end_time) .execute(&mut *tx) .await? .last_insert_rowid(); @@ -814,7 +834,9 @@ impl DatabaseManager { GROUP_CONCAT(tags.name, ',') as tags, audio_transcriptions.device as device_name, audio_transcriptions.is_input_device, - audio_transcriptions.speaker_id + audio_transcriptions.speaker_id, + audio_transcriptions.start_time, + audio_transcriptions.end_time FROM {} JOIN audio_chunks ON audio_transcriptions.audio_chunk_id = audio_chunks.id LEFT JOIN speakers on audio_transcriptions.speaker_id = speakers.id @@ -873,6 +895,8 @@ impl DatabaseManager { DeviceType::Output }, speaker, + start_time: raw.start_time, + end_time: raw.end_time, }) }); @@ -944,7 +968,7 @@ impl DatabaseManager { ContentType::Audio => { format!( r#" - SELECT COUNT(DISTINCT audio_transcriptions.audio_chunk_id) + SELECT COUNT(DISTINCT audio_transcriptions.audio_chunk_id || '_' || COALESCE(audio_transcriptions.start_time, '') || '_' || COALESCE(audio_transcriptions.end_time, '')) FROM audio_transcriptions_fts JOIN audio_transcriptions ON audio_transcriptions_fts.audio_chunk_id = audio_transcriptions.audio_chunk_id WHERE {} @@ -1527,13 +1551,23 @@ impl DatabaseManager { pub async fn get_audio_chunks_for_speaker( &self, speaker_id: i64, - ) -> Result, sqlx::Error> { - sqlx::query_as::<_, AudioChunk>( - "SELECT * FROM audio_chunks WHERE id IN (SELECT audio_chunk_id FROM audio_transcriptions WHERE speaker_id = ?)", + ) -> Result, sqlx::Error> { + sqlx::query_as::<_, AudioChunksResponse>( + r#" + SELECT + ac.*, + at.start_time, + at.end_time, + ac.file_path + FROM audio_chunks ac + JOIN audio_transcriptions at ON ac.id = at.audio_chunk_id + WHERE at.speaker_id = ? 
+ ORDER BY at.start_time + "#, ) - .bind(speaker_id) - .fetch_all(&self.pool) - .await + .bind(speaker_id) + .fetch_all(&self.pool) + .await } // get unnamed speakers @@ -1548,7 +1582,9 @@ impl DatabaseManager { SELECT DISTINCT s.id as speaker_id, ac.file_path, - at.transcription + at.transcription, + at.start_time, + at.end_time FROM speakers s JOIN audio_transcriptions at ON s.id = at.speaker_id JOIN audio_chunks ac ON at.audio_chunk_id = ac.id @@ -1583,7 +1619,9 @@ impl DatabaseManager { THEN json_object('audio_samples', json_group_array( DISTINCT json_object( 'path', rap.file_path, - 'transcript', rap.transcription + 'transcript', rap.transcription, + 'start_time', rap.start_time, + 'end_time', rap.end_time ) )) ELSE json_patch( @@ -1591,7 +1629,9 @@ impl DatabaseManager { json_object('audio_samples', json_group_array( DISTINCT json_object( 'path', rap.file_path, - 'transcript', rap.transcription + 'transcript', rap.transcription, + 'start_time', rap.start_time, + 'end_time', rap.end_time ) )) ) @@ -1674,7 +1714,7 @@ impl DatabaseManager { "audio transcriptions", ), ( - "DELETE FROM audio_chunks WHERE id IN (SELECT audio_chunk_id FROM audio_transcriptions WHERE speaker_id = ?)", + "DELETE FROM audio_chunks WHERE id IN (SELECT audio_chunk_id FROM audio_transcriptions WHERE speaker_id = ? AND start_time IS NULL)", "audio chunks", ), ( @@ -1719,7 +1759,9 @@ impl DatabaseManager { SELECT DISTINCT s.id as speaker_id, ac.file_path, - at.transcription + at.transcription, + at.start_time, + at.end_time FROM speakers s JOIN audio_transcriptions at ON s.id = at.speaker_id JOIN audio_chunks ac ON at.audio_chunk_id = ac.id @@ -1742,13 +1784,17 @@ impl DatabaseManager { WHEN s.metadata = '' OR s.metadata IS NULL OR json_valid(s.metadata) = 0 THEN json_object('audio_samples', json_group_array(DISTINCT json_object( 'path', rap.file_path, - 'transcript', rap.transcription + 'transcript', rap.transcription, + 'start_time', rap.start_time, + 'end_time', rap.end_time ))) ELSE json_patch( json(s.metadata), json_object('audio_samples', json_group_array(DISTINCT json_object( 'path', rap.file_path, - 'transcript', rap.transcription + 'transcript', rap.transcription, + 'start_time', rap.start_time, + 'end_time', rap.end_time ))) ) END as metadata diff --git a/screenpipe-server/src/db_types.rs b/screenpipe-server/src/db_types.rs index 5459ce4c4..f3b3b0b1d 100644 --- a/screenpipe-server/src/db_types.rs +++ b/screenpipe-server/src/db_types.rs @@ -82,6 +82,8 @@ pub struct AudioResultRaw { pub device_name: String, pub is_input_device: bool, pub speaker_id: Option, + pub start_time: Option, + pub end_time: Option, } #[derive(Debug, Serialize, Deserialize, FromRow, Clone)] @@ -103,6 +105,8 @@ pub struct AudioResult { pub device_name: String, pub device_type: DeviceType, pub speaker: Option, + pub start_time: Option, + pub end_time: Option, } #[derive(Debug, Deserialize, PartialEq)] @@ -181,3 +185,12 @@ pub struct AudioChunk { pub file_path: String, pub timestamp: DateTime, } + +#[derive(Debug, FromRow)] +pub struct AudioChunksResponse { + pub audio_chunk_id: i64, + pub start_time: Option, + pub end_time: Option, + pub file_path: String, + pub timestamp: DateTime, +} diff --git a/screenpipe-server/src/migrations/20241213220649_create_segment_start_time_and_end_time_columns.sql b/screenpipe-server/src/migrations/20241213220649_create_segment_start_time_and_end_time_columns.sql new file mode 100644 index 000000000..296b99ea6 --- /dev/null +++ 
b/screenpipe-server/src/migrations/20241213220649_create_segment_start_time_and_end_time_columns.sql @@ -0,0 +1,67 @@ +-- Add migration script here +ALTER TABLE audio_transcriptions ADD COLUMN start_time REAL; +ALTER TABLE audio_transcriptions ADD COLUMN end_time REAL; + +PRAGMA foreign_keys = OFF; + +-- Drop existing triggers and FTS tables +DROP TRIGGER IF EXISTS audio_transcriptions_ai; +DROP TRIGGER IF EXISTS audio_transcriptions_update; +DROP TRIGGER IF EXISTS audio_transcriptions_delete; +DROP TABLE IF EXISTS audio_transcriptions_fts; + +CREATE VIRTUAL TABLE IF NOT EXISTS audio_transcriptions_fts USING fts5( + transcription, + device, + audio_chunk_id UNINDEXED, + speaker_id, + start_time UNINDEXED, + end_time UNINDEXED, + tokenize='unicode61' +); + +INSERT OR IGNORE INTO audio_transcriptions_fts(transcription, device, audio_chunk_id, speaker_id, start_time, end_time) +SELECT + COALESCE(transcription, '') as transcription, + COALESCE(device, '') as device, + audio_chunk_id, + speaker_id, + start_time, + end_time +FROM audio_transcriptions +WHERE transcription IS NOT NULL + AND transcription != '' + AND audio_chunk_id IS NOT NULL; + +CREATE TRIGGER IF NOT EXISTS audio_transcriptions_ai AFTER INSERT ON audio_transcriptions +WHEN NEW.transcription IS NOT NULL AND NEW.transcription != '' AND NEW.audio_chunk_id IS NOT NULL +BEGIN + INSERT OR IGNORE INTO audio_transcriptions_fts(transcription, device, audio_chunk_id, speaker_id, start_time, end_time) + VALUES ( + NEW.transcription, + COALESCE(NEW.device, ''), + NEW.audio_chunk_id, + NEW.speaker_id, + NEW.start_time, + NEW.end_time + ); +END; + +CREATE TRIGGER IF NOT EXISTS audio_transcriptions_update AFTER UPDATE ON audio_transcriptions +WHEN NEW.transcription IS NOT NULL AND NEW.transcription != '' AND OLD.audio_chunk_id IS NOT NULL +BEGIN + UPDATE audio_transcriptions_fts + SET transcription = NEW.transcription, + device = COALESCE(NEW.device, ''), + start_time = NEW.start_time, + end_time = NEW.end_time + WHERE audio_chunk_id = OLD.audio_chunk_id; +END; + +CREATE TRIGGER IF NOT EXISTS audio_transcriptions_delete AFTER DELETE ON audio_transcriptions +BEGIN + DELETE FROM audio_transcriptions_fts + WHERE audio_chunk_id = OLD.audio_chunk_id; +END; + +PRAGMA foreign_keys = ON; \ No newline at end of file diff --git a/screenpipe-server/src/server.rs b/screenpipe-server/src/server.rs index df197e3ed..b00eceda6 100644 --- a/screenpipe-server/src/server.rs +++ b/screenpipe-server/src/server.rs @@ -185,6 +185,8 @@ pub struct AudioContent { pub device_name: String, pub device_type: DeviceType, pub speaker: Option, + pub start_time: Option, + pub end_time: Option, } #[derive(Serialize, Deserialize, Debug)] @@ -338,6 +340,8 @@ pub(crate) async fn search( device_name: audio.device_name.clone(), device_type: audio.device_type.clone(), speaker: audio.speaker.clone(), + start_time: audio.start_time, + end_time: audio.end_time, }), SearchResult::UI(ui) => ContentItem::UI(UiContent { id: ui.id, @@ -521,7 +525,7 @@ pub async fn health_check(State(state): State>) -> JsonResponse) { "test_engine", &AudioDevice::new("test".to_string(), DeviceType::Output), None, + None, + None, ) .await .unwrap(); diff --git a/screenpipe-vision/bin/ui_monitor-aarch64-apple-darwin b/screenpipe-vision/bin/ui_monitor-aarch64-apple-darwin index 1546bce73..27dc6d78e 100755 Binary files a/screenpipe-vision/bin/ui_monitor-aarch64-apple-darwin and b/screenpipe-vision/bin/ui_monitor-aarch64-apple-darwin differ diff --git a/screenpipe-vision/lib/libscreenpipe_arm64.dylib 
b/screenpipe-vision/lib/libscreenpipe_arm64.dylib index 817087c34..d936a8fe0 100755 Binary files a/screenpipe-vision/lib/libscreenpipe_arm64.dylib and b/screenpipe-vision/lib/libscreenpipe_arm64.dylib differ diff --git a/screenpipe-vision/src/core.rs b/screenpipe-vision/src/core.rs index 3dea68919..b0c46743a 100644 --- a/screenpipe-vision/src/core.rs +++ b/screenpipe-vision/src/core.rs @@ -296,3 +296,4 @@ pub fn trigger_screen_capture_permission() -> Result<()> { Ok(()) } + \ No newline at end of file