Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add PlainTextSplitter and text to speech example #249

Merged
merged 1 commit into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@ Cargo.lock
.DS_Store
.fastembed_cache
.vscode
.idea
.idea

# Ignore files generated by text_to_speech example
*.mp3
147 changes: 147 additions & 0 deletions examples/text_to_speech.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
use std::{io::Cursor, process::Stdio};

use futures::StreamExt;
use langchain_rust::{
document_loaders::{HtmlLoader, Loader},
schemas::Document,
text_splitter::{PlainTextSplitter, PlainTextSplitterOptions, TextSplitter},
tools::{Text2SpeechOpenAI, Tool},
};
use tokio::{io::AsyncReadExt, process::Command};
use url::Url;

/// Text-to-speech example.
///
/// Pipeline:
/// 1. Fetch a web page and strip it down to plain text with `HtmlLoader`.
/// 2. Split the text into chunks with `PlainTextSplitter`.
/// 3. Turn each chunk into an `.mp3` file with `Text2SpeechOpenAI`.
/// 4. Concatenate the chunk files into `output.mp3` with `ffmpeg`.
/// 5. Delete the intermediate chunk files — only if the merge succeeded.
///
/// Requires `ffmpeg` on `PATH`. This is example code, so most failures simply
/// panic via `unwrap`/`expect`.
#[tokio::main]
async fn main() {
    // URL to generate audio from.
    let url = "https://en.m.wikivoyage.org/wiki/Seoul";
    let output_path = "output.mp3";

    // Use reqwest to fetch the raw HTML content.
    println!("Fetching URL: {}\n", url);
    let html = reqwest::get(url).await.unwrap().text().await.unwrap();

    // Use HtmlLoader to load the HTML content and extract plain text without html tags.
    let html_loader = HtmlLoader::new(Cursor::new(html), Url::parse(url).unwrap());
    let documents: Vec<Document> = html_loader
        .load()
        .await
        .unwrap()
        .map(|x| x.unwrap())
        .collect()
        .await;

    // Since OpenAI has limits for input text size, use PlainTextSplitter to split the text into
    // chunks that are acceptable by OpenAI.
    let splitter = PlainTextSplitter::new(
        PlainTextSplitterOptions::default()
            // NOTE: PlainTextSplitter doesn't handle unicode chars, so make
            // sure to put some buffer if you are using unicode characters.
            .with_chunk_size(3000)
            .with_chunk_overlap(0)
            .with_trim_chunks(true),
    );
    let text_chunks = splitter
        .split_documents(&documents)
        .await
        .unwrap()
        .into_iter()
        .take(2) // Take only 2 for now to save time and cost as example.
        .collect::<Vec<Document>>();

    // One intermediate audio file per chunk. The names are computed once so that
    // generation, merging and cleanup are guaranteed to agree on them.
    let chunk_paths: Vec<String> = (0..text_chunks.len())
        .map(|i| format!("chunk_{}.mp3", i))
        .collect();

    // Loop through each text chunk, generate audio using OpenAI and save it to disk.
    for (i, (chunk, chunk_path)) in text_chunks.iter().zip(&chunk_paths).enumerate() {
        println!(
            "Processing chunk {} of {} with chunk size {}: \n{}\n",
            i + 1, // 1-based for display, so the last chunk reads "N of N"
            text_chunks.len(),
            chunk.page_content.len(),
            &chunk.page_content
        );

        let openai = Text2SpeechOpenAI::default().with_path(chunk_path.clone());
        let path = openai.call(&chunk.page_content).await.unwrap();

        let path = std::path::Path::new(&path).canonicalize().unwrap();
        println!("Chunk file saved at: {:?}\n\n", path);
    }

    // Use ffmpeg to concatenate all the audio chunks into a single audio file.
    // ffmpeg -hide_banner -i "concat:chunk_0.mp3|chunk_1.mp3" -acodec copy -y output.mp3
    let args: Vec<String> = vec![
        "-hide_banner".into(),
        "-i".into(),
        format!("concat:{}", chunk_paths.join("|")),
        "-acodec".into(),
        "copy".into(),
        "-y".into(), // overwrite output file
        output_path.into(),
    ];

    println!(
        "Merging {} audio chunks using: ffmpeg {}\n",
        text_chunks.len(),
        args.join(" ")
    );

    let mut child = Command::new("ffmpeg")
        .args(&args)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .expect("Failed to start ffmpeg process.");

    let mut stdout = child.stdout.take().expect("Failed to open stdout");
    let mut stderr = child.stderr.take().expect("Failed to open stderr");

    // Both pipes are drained on their own tasks while we wait for the child, so
    // ffmpeg cannot stall on a full pipe buffer.
    let stdout_handle = tokio::spawn(async move {
        let mut buffer = vec![0; 1024];
        while let Ok(size) = stdout.read(&mut buffer).await {
            if size == 0 {
                break;
            }
            let output = String::from_utf8_lossy(&buffer[..size]);
            print!("FFmpeg STDOUT: {}", output);
        }
    });

    let stderr_handle = tokio::spawn(async move {
        let mut buffer = vec![0; 1024];
        while let Ok(size) = stderr.read(&mut buffer).await {
            if size == 0 {
                break;
            }
            let error = String::from_utf8_lossy(&buffer[..size]);
            eprint!("FFmpeg STDERR: {}", error);
        }
    });

    let ffmpeg_exit_status = child.wait().await.unwrap();
    stdout_handle.await.unwrap();
    stderr_handle.await.unwrap();

    println!(
        "FFmpeg process finished with exit status {}",
        ffmpeg_exit_status
    );

    // If the merge failed, keep the (paid-for) chunk files and stop here instead of
    // deleting them and then panicking while resolving a missing output file.
    if !ffmpeg_exit_status.success() {
        eprintln!(
            "FFmpeg failed; keeping intermediate audio chunk files: {}",
            chunk_paths.join(", ")
        );
        std::process::exit(1);
    }

    println!("Cleaning up intermediate audio chunk files...");
    for chunk_path in &chunk_paths {
        tokio::fs::remove_file(chunk_path).await.unwrap();
    }
    println!("Cleaning up intermediate audio chunk files complete.");

    let path = std::path::Path::new(&output_path).canonicalize().unwrap();
    println!("Final audio saved at: {:?}", path);
}
2 changes: 2 additions & 0 deletions src/text_splitter/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
mod error;
mod markdown_splitter;
mod options;
mod plain_text_splitter;
mod text_splitter;
mod token_splitter;

pub use error::*;
pub use markdown_splitter::*;
pub use options::*;
pub use plain_text_splitter::*;
pub use text_splitter::*;
pub use token_splitter::*;
85 changes: 85 additions & 0 deletions src/text_splitter/plain_text_splitter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use async_trait::async_trait;

use super::{TextSplitter, TextSplitterError};

/// Configuration for [`PlainTextSplitter`].
///
/// Start from [`PlainTextSplitterOptions::new`] (or `Default::default()`) and
/// adjust individual settings with the chained `with_*` methods:
///
/// ```ignore
/// let options = PlainTextSplitterOptions::default()
///     .with_chunk_size(3000)
///     .with_trim_chunks(true);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlainTextSplitterOptions {
    /// Maximum size of a chunk. Forwarded to `text_splitter::ChunkConfig::new`;
    /// the unit is whatever that crate's default sizer measures
    /// (presumably characters — confirm against the `text_splitter` docs).
    pub chunk_size: usize,
    /// Overlap between consecutive chunks, forwarded to `ChunkConfig::with_overlap`.
    pub chunk_overlap: usize,
    /// Whether chunks are trimmed, forwarded to `ChunkConfig::with_trim`.
    pub trim_chunks: bool,
}

impl Default for PlainTextSplitterOptions {
    /// Same as [`PlainTextSplitterOptions::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl PlainTextSplitterOptions {
    /// Creates the default options: `chunk_size` 512, `chunk_overlap` 0,
    /// `trim_chunks` false.
    #[must_use]
    pub fn new() -> Self {
        PlainTextSplitterOptions {
            chunk_size: 512,
            chunk_overlap: 0,
            trim_chunks: false,
        }
    }

    /// Returns the options with `chunk_size` replaced.
    #[must_use]
    pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
        self.chunk_size = chunk_size;
        self
    }

    /// Returns the options with `chunk_overlap` replaced.
    #[must_use]
    pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
        self.chunk_overlap = chunk_overlap;
        self
    }

    /// Returns the options with `trim_chunks` replaced.
    #[must_use]
    pub fn with_trim_chunks(mut self, trim_chunks: bool) -> Self {
        self.trim_chunks = trim_chunks;
        self
    }

    /// Configured maximum chunk size.
    pub fn chunk_size(&self) -> usize {
        self.chunk_size
    }

    /// Configured overlap between consecutive chunks.
    pub fn chunk_overlap(&self) -> usize {
        self.chunk_overlap
    }

    /// Whether chunks are trimmed.
    pub fn trim_chunks(&self) -> bool {
        self.trim_chunks
    }
}

/// Text splitter for plain text, configured by a [`PlainTextSplitterOptions`].
///
/// The options are stored at construction time and read each time text is split.
#[derive(Debug, Clone)]
pub struct PlainTextSplitter {
    /// Settings applied on every split.
    splitter_options: PlainTextSplitterOptions,
}

impl Default for PlainTextSplitter {
    /// Builds a splitter from `PlainTextSplitterOptions::default()`.
    fn default() -> Self {
        Self::new(PlainTextSplitterOptions::default())
    }
}

impl PlainTextSplitter {
    /// Creates a splitter that uses the given `options`.
    pub fn new(options: PlainTextSplitterOptions) -> Self {
        Self {
            splitter_options: options,
        }
    }
}

#[async_trait]
impl TextSplitter for PlainTextSplitter {
    /// Splits `text` into owned chunks using the `text_splitter` crate.
    ///
    /// The stored options are translated into a `text_splitter::ChunkConfig`:
    /// `chunk_size` sets the capacity, `trim_chunks` the trim flag and
    /// `chunk_overlap` the overlap.
    ///
    /// # Errors
    ///
    /// Propagates (via `?`) the error returned by `ChunkConfig::with_overlap`
    /// when the configured overlap is rejected.
    async fn split_text(&self, text: &str) -> Result<Vec<String>, TextSplitterError> {
        let opts = &self.splitter_options;

        // Build the chunking configuration first so the fallible step is isolated.
        let config = text_splitter::ChunkConfig::new(opts.chunk_size)
            .with_trim(opts.trim_chunks)
            .with_overlap(opts.chunk_overlap)?;

        let inner = text_splitter::TextSplitter::new(config);

        // The inner splitter yields borrowed slices; copy each into an owned String.
        let pieces = inner.chunks(text).map(String::from).collect();
        Ok(pieces)
    }
}
Loading