Abraxas-365 · prabirshrestha · Oct 12, 2024 · Oct 11, 2024
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,7 @@ Cargo.lock
 .DS_Store
 .fastembed_cache
 .vscode
-.idea
+.idea
+
+# Ignore files generated by text_to_speech example
+*.mp3
diff --git a/examples/text_to_speech.rs b/examples/text_to_speech.rs
@@ -0,0 +1,147 @@
+use std::{io::Cursor, process::Stdio};
+
+use futures::StreamExt;
+use langchain_rust::{
+    document_loaders::{HtmlLoader, Loader},
+    schemas::Document,
+    text_splitter::{PlainTextSplitter, PlainTextSplitterOptions, TextSplitter},
+    tools::{Text2SpeechOpenAI, Tool},
+};
+use tokio::{io::AsyncReadExt, process::Command};
+use url::Url;
+
+/// Fetches a web page, turns its first text chunks into speech with OpenAI, and merges the
+/// resulting MP3 files into `output.mp3` using ffmpeg (which must be installed).
+#[tokio::main]
+async fn main() {
+    // URL to generate audio from.
+    let url = "https://en.m.wikivoyage.org/wiki/Seoul";
+    let output_path = "output.mp3";
+
+    // Use reqwest to fetch the raw HTML content.
+    println!("Fetching URL: {}\n", url);
+    let html = reqwest::get(url).await.unwrap().text().await.unwrap();
+
+    // Use HtmlLoader to load the HTML content and extract plain text without html tags.
+    let html_loader = HtmlLoader::new(Cursor::new(html), Url::parse(url).unwrap());
+    let documents: Vec<Document> = html_loader
+        .load()
+        .await
+        .unwrap()
+        .map(|x| x.unwrap())
+        .collect()
+        .await;
+
+    // Since OpenAI has limits for input text size, use PlainTextSplitter to split the text into
+    // chunks that are acceptable by OpenAI.
+    let splitter = PlainTextSplitter::new(
+        PlainTextSplitterOptions::default()
+            // NOTE: PlainTextSplitter doesn't handle unicode chars, so make
+            // sure to put some buffer if you are using unicode characters.
+            .with_chunk_size(3000)
+            .with_chunk_overlap(0)
+            .with_trim_chunks(true),
+    );
+    let text_chunks = splitter
+        .split_documents(&documents)
+        .await
+        .unwrap()
+        .into_iter()
+        .take(2) // Take only 2 for now to save time and cost as example.
+        .collect::<Vec<Document>>();
+
+    // Loop through each text chunk, generate audio using OpenAI and save it to disk.
+    for (i, chunk) in text_chunks.iter().enumerate() {
+        println!(
+            "Processing chunk {} of {} with chunk size {}: \n{}\n",
+            i + 1, // 1-based for display; the file names below stay 0-based
+            text_chunks.len(),
+            chunk.page_content.len(),
+            &chunk.page_content
+        );
+
+        let openai = Text2SpeechOpenAI::default().with_path(format!("chunk_{}.mp3", i));
+        let path = openai.call(&chunk.page_content).await.unwrap();
+
+        let path = std::path::Path::new(&path).canonicalize().unwrap();
+        println!("Chunk file saved at: {:?}\n\n", path);
+    }
+
+    // Use ffmpeg to concatenate all the audio chunks into a single audio file.
+    // ffmpeg -hide_banner -i "concat:chunk_0.mp3|chunk_1.mp3" -acodec copy -y output.mp3
+    let chunks_paths_list = (0..text_chunks.len())
+        .map(|i| format!("chunk_{}.mp3", i))
+        .collect::<Vec<String>>()
+        .join("|");
+
+    let args: Vec<String> = vec![
+        "-hide_banner".into(),
+        "-i".into(),
+        format!("concat:{}", &chunks_paths_list),
+        "-acodec".into(),
+        "copy".into(),
+        "-y".into(), // overwrite output file
+        output_path.into(),
+    ];
+
+    println!(
+        "Merging {} audio chunks using: ffmpeg {}\n",
+        text_chunks.len(),
+        &args.join(" ")
+    );
+
+    let mut child = Command::new("ffmpeg")
+        .args(args)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .spawn()
+        .expect("Failed to start ffmpeg process.");
+
+    let mut stdout = child.stdout.take().expect("Failed to open stdout");
+    let mut stderr = child.stderr.take().expect("Failed to open stderr");
+
+    let stdout_handle = tokio::spawn(async move {
+        let mut buffer = vec![0; 1024];
+        while let Ok(size) = stdout.read(&mut buffer).await {
+            if size == 0 {
+                break;
+            }
+            let output = String::from_utf8_lossy(&buffer[..size]);
+            print!("FFmpeg STDOUT: {}", output);
+        }
+    });
+
+    let stderr_handle = tokio::spawn(async move {
+        let mut buffer = vec![0; 1024];
+        while let Ok(size) = stderr.read(&mut buffer).await {
+            if size == 0 {
+                break;
+            }
+            let error = String::from_utf8_lossy(&buffer[..size]);
+            eprint!("FFmpeg STDERR: {}", error);
+        }
+    });
+
+    let ffmpeg_exit_status = child.wait().await.unwrap();
+    stdout_handle.await.unwrap();
+    stderr_handle.await.unwrap();
+
+    println!(
+        "FFmpeg process finished with exit status {}",
+        ffmpeg_exit_status
+    );
+    // Stop before cleanup if the merge failed, so the chunk files are not deleted.
+    if !ffmpeg_exit_status.success() {
+        eprintln!("FFmpeg failed; keeping intermediate audio chunk files.");
+        std::process::exit(1);
+    }
+
+    println!("Cleaning up intermediate audio chunk files...");
+    for i in 0..text_chunks.len() {
+        tokio::fs::remove_file(format!("chunk_{}.mp3", i)).await.unwrap();
+    }
+    println!("Cleaning up intermediate audio chunk files complete.");
+
+    let path = std::path::Path::new(&output_path).canonicalize().unwrap();
+    println!("Final audio saved at: {:?}", path);
+}
diff --git a/src/text_splitter/mod.rs b/src/text_splitter/mod.rs
@@ -1,11 +1,13 @@
 mod error;
 mod markdown_splitter;
 mod options;
+mod plain_text_splitter;
 mod text_splitter;
 mod token_splitter;
 
 pub use error::*;
 pub use markdown_splitter::*;
 pub use options::*;
+pub use plain_text_splitter::*;
 pub use text_splitter::*;
 pub use token_splitter::*;
diff --git a/src/text_splitter/plain_text_splitter.rs b/src/text_splitter/plain_text_splitter.rs
@@ -0,0 +1,85 @@
+use async_trait::async_trait;
+
+use super::{TextSplitter, TextSplitterError};
+
+/// Options for [`PlainTextSplitter`]; each field is forwarded to `text_splitter::ChunkConfig`.
+#[derive(Debug, Clone)]
+pub struct PlainTextSplitterOptions {
+    pub chunk_size: usize,    // passed to `ChunkConfig::new`
+    pub chunk_overlap: usize, // passed to `ChunkConfig::with_overlap`
+    pub trim_chunks: bool,    // passed to `ChunkConfig::with_trim`
+}
+
+impl Default for PlainTextSplitterOptions {
+    fn default() -> PlainTextSplitterOptions {
+        PlainTextSplitterOptions::new() // `new()` is the single source of the default values
+    }
+}
+
+impl PlainTextSplitterOptions {
+    /// Creates options with a chunk size of 512, no overlap and trimming disabled.
+    pub fn new() -> Self {
+        Self {
+            chunk_size: 512,
+            chunk_overlap: 0,
+            trim_chunks: false,
+        }
+    }
+
+    // Builder-style setters: each consumes `self` and returns it with one field replaced.
+    pub fn with_chunk_size(self, chunk_size: usize) -> Self {
+        Self { chunk_size, ..self }
+    }
+
+    pub fn with_chunk_overlap(self, chunk_overlap: usize) -> Self {
+        Self { chunk_overlap, ..self }
+    }
+
+    pub fn with_trim_chunks(self, trim_chunks: bool) -> Self {
+        Self { trim_chunks, ..self }
+    }
+
+    // Read accessors for the individual settings.
+    pub fn chunk_size(&self) -> usize {
+        self.chunk_size
+    }
+
+    pub fn chunk_overlap(&self) -> usize {
+        self.chunk_overlap
+    }
+
+    pub fn trim_chunks(&self) -> bool {
+        self.trim_chunks
+    }
+}
+
+/// Text splitter that chunks plain text with the `text_splitter` crate, configured by
+/// [`PlainTextSplitterOptions`].
+///
+/// The derived `Default` uses `PlainTextSplitterOptions::default()`.
+#[derive(Debug, Clone, Default)]
+pub struct PlainTextSplitter {
+    splitter_options: PlainTextSplitterOptions,
+}
+
+impl PlainTextSplitter {
+    /// Creates a splitter that uses the given `options`.
+    pub fn new(options: PlainTextSplitterOptions) -> PlainTextSplitter {
+        PlainTextSplitter {
+            splitter_options: options,
+        }
+    }
+}
+
+#[async_trait]
+impl TextSplitter for PlainTextSplitter {
+    /// Splits `text` into chunks; fails if `text_splitter` rejects the overlap setting.
+    async fn split_text(&self, text: &str) -> Result<Vec<String>, TextSplitterError> {
+        let opts = &self.splitter_options;
+        let config = text_splitter::ChunkConfig::new(opts.chunk_size)
+            .with_trim(opts.trim_chunks)
+            .with_overlap(opts.chunk_overlap)?;
+        let chunker = text_splitter::TextSplitter::new(config);
+        Ok(chunker.chunks(text).map(String::from).collect())
+    }
+}