Skip to content

Commit

Permalink
add pdf-extract loader (#202)
Browse files Browse the repository at this point in the history
  • Loading branch information
prabirshrestha authored Aug 17, 2024
1 parent 5a6160c commit 54fbe26
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 9 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ text-splitter = { version = "0.15", features = ["tiktoken-rs", "markdown"] }
surrealdb = { version = "1.4.2", optional = true, default-features = false }
csv = "1.3.0"
urlencoding = "2.1.3"
lopdf = { version = "0.33.0", features = ["pom", "pom_parser"], optional = true }
lopdf = { version = "0.32.0", features = ["nom_parser"], optional = true }
pdf-extract = { version = "0.7.7", optional = true }
thiserror = "1.0.59"
futures-util = "0.3.30"
async-stream = "0.3.5"
Expand Down Expand Up @@ -85,6 +86,7 @@ fastembed = ["dep:fastembed"]
git = ["gix", "flume"]
mistralai = ["mistralai-client"]
lopdf = ["dep:lopdf"]
pdf-extract = ["dep:lopdf", "dep:pdf-extract"]
ollama = ["ollama-rs"]
opensearch = ["dep:opensearch", "aws-config"]
postgres = ["pgvector", "sqlx", "uuid"]
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ This is the Rust language implementation of [LangChain](https://github.com/langc
async fn main() {
let path = "./src/document_loaders/test_data/sample.pdf";

let loader = LoPdfLoader::from_path(path).expect("Failed to create PdfLoader");
let loader = PdfExtractLoader::from_path(path).expect("Failed to create PdfExtractLoader");
// let loader = LoPdfLoader::from_path(path).expect("Failed to create LoPdfLoader");

let docs = loader
.load()
Expand Down
9 changes: 8 additions & 1 deletion src/document_loaders/error.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::io;
use std::{io, string::FromUtf8Error};

use thiserror::Error;

Expand All @@ -15,13 +15,20 @@ pub enum LoaderError {
#[error(transparent)]
IOError(#[from] io::Error),

#[error(transparent)]
FromUtf8Error(#[from] FromUtf8Error),

#[error(transparent)]
CSVError(#[from] csv::Error),

#[cfg(feature = "lopdf")]
#[error(transparent)]
LoPdfError(#[from] lopdf::Error),

#[cfg(feature = "pdf-extract")]
#[error(transparent)]
PdfExtractOutputError(#[from] pdf_extract::OutputError),

#[error(transparent)]
ReadabilityError(#[from] readability::error::Error),

Expand Down
4 changes: 2 additions & 2 deletions src/document_loaders/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ pub use git_commit_loader::*;
mod pandoc_loader;
pub use pandoc_loader::*;

#[cfg(feature = "lopdf")]
#[cfg(any(feature = "lopdf", feature = "pdf_extract"))]

Check warning on line 18 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`

Check warning on line 18 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`
mod pdf_loader;
#[cfg(feature = "lopdf")]
#[cfg(any(feature = "lopdf", feature = "pdf_extract"))]

Check warning on line 20 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`

Check warning on line 20 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`
pub use pdf_loader::*;

mod html_loader;
Expand Down
8 changes: 4 additions & 4 deletions src/document_loaders/pdf_loader/lo_loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl LoPdfLoader {
/// ```rust,ignore
/// use std::io::Cursor;
/// let data = Cursor::new(vec![...] /* some PDF data */);
/// let loader = PdfLoader::new(data)?;
/// let loader = LoPdfLoader::new(data)?;
/// ```
///
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
Expand All @@ -38,7 +38,7 @@ impl LoPdfLoader {
/// # Example
///
/// ```rust,ignore
/// let loader = PdfLoader::from_path("/path/to/my.pdf")?;
/// let loader = LoPdfLoader::from_path("/path/to/my.pdf")?;
/// ```
///
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, LoaderError> {
Expand Down Expand Up @@ -96,7 +96,7 @@ mod tests {
async fn test_lo_pdf_loader() {
let path = "./src/document_loaders/test_data/sample.pdf";

let loader = LoPdfLoader::from_path(path).expect("Failed to create PdfLoader");
let loader = LoPdfLoader::from_path(path).expect("Failed to create LoPdfLoader");

let docs = loader
.load()
Expand All @@ -121,7 +121,7 @@ mod tests {
file.read_to_end(&mut buffer).unwrap();
let reader = Cursor::new(buffer);

let loader = LoPdfLoader::new(reader).expect("Failed to create PdfLoader");
let loader = LoPdfLoader::new(reader).expect("Failed to create LoPdfLoader");

let docs = loader
.load()
Expand Down
3 changes: 3 additions & 0 deletions src/document_loaders/pdf_loader/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
#[cfg(feature = "lopdf")]
pub mod lo_loader;

#[cfg(feature = "pdf-extract")]
pub mod pdf_extract_loader;
130 changes: 130 additions & 0 deletions src/document_loaders/pdf_loader/pdf_extract_loader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
use std::{io::Read, path::Path, pin::Pin};

use async_stream::stream;
use async_trait::async_trait;
use futures::Stream;
use pdf_extract::{output_doc, PlainTextOutput};

use crate::{
document_loaders::{process_doc_stream, Loader, LoaderError},
schemas::Document,
text_splitter::TextSplitter,
};

/// Document loader that converts a PDF into plain text with the `pdf-extract` crate.
///
/// The PDF is parsed into a [`lopdf::Document`] when the loader is constructed
/// (see [`PdfExtractLoader::new`] and [`PdfExtractLoader::from_path`]); the text
/// itself is only extracted when [`Loader::load`] is called.
#[derive(Debug, Clone)]
pub struct PdfExtractLoader {
// Parsed PDF, kept until `load` renders it to text.
document: lopdf::Document,
}

impl PdfExtractLoader {
/// Creates a new PdfLoader from anything that implements the Read trait.
/// This is a generic constructor which can be used with any type of reader.
///
/// # Example
///
/// ```rust,ignore
/// use std::io::Cursor;
/// let data = Cursor::new(vec![...] /* some PDF data */);
/// let loader = PdfExtractLoader::new(data)?;
/// ```
///
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
let document = lopdf::Document::load_from(reader)?;
Ok(Self { document })
}
/// Creates a new PdfLoader from a path to a PDF file.
/// This loads the PDF document and creates a PdfLoader from it.
///
/// # Example
///
/// ```rust,ignore
/// let loader = PdfExtractLoader::from_path("/path/to/my.pdf")?;
/// ```
///
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, LoaderError> {
let document = lopdf::Document::load(path)?;
Ok(Self { document })
}
}

#[async_trait]
impl Loader for PdfExtractLoader {
    /// Extracts the text of the whole PDF and yields it as a single [`Document`].
    ///
    /// The extraction runs eagerly, before the stream is returned; the returned
    /// stream then yields exactly one `Ok(Document)` item.
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] if `pdf-extract` fails while rendering the
    /// document, or if the rendered bytes are not valid UTF-8.
    async fn load(
        self,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
        LoaderError,
    > {
        let mut buffer: Vec<u8> = Vec::new();
        {
            // `PlainTextOutput` mutably borrows the buffer. Keeping it in its own
            // scope makes it explicit that the borrow is over before the buffer
            // is consumed below.
            let mut output = PlainTextOutput::new(&mut buffer as &mut dyn std::io::Write);
            output_doc(&self.document, &mut output)?;
        }
        let doc = Document::new(String::from_utf8(buffer)?);

        let stream = stream! {
            yield Ok(doc);
        };

        Ok(Box::pin(stream))
    }

    /// Loads the PDF via [`Loader::load`] and passes the resulting stream,
    /// together with `splitter`, to `process_doc_stream`.
    ///
    /// # Errors
    ///
    /// Returns any [`LoaderError`] produced by [`Loader::load`].
    async fn load_and_split<TS: TextSplitter + 'static>(
        self,
        splitter: TS,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
        LoaderError,
    > {
        let doc_stream = self.load().await?;
        let stream = process_doc_stream(doc_stream, splitter).await;
        Ok(Box::pin(stream))
    }
}

#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use futures_util::StreamExt;

    use super::*;

    /// Sample PDF shared by all tests in this module.
    const SAMPLE_PDF_PATH: &str = "./src/document_loaders/test_data/sample.pdf";

    /// Text the extracted sample document is expected to start with (100 bytes).
    const EXPECTED_PREFIX: &str = "\n\nSample PDF Document\n\nRobert Maron\nGrzegorz Grudzi´nski\n\nFebruary 20, 1999\n\n2\n\nContents\n\n1 Templat";

    /// Drains the loader's stream into a vector, panicking on any error.
    async fn load_all(loader: PdfExtractLoader) -> Vec<Document> {
        loader
            .load()
            .await
            .expect("Failed to load PDF")
            .map(|d| d.expect("Failed to read document from stream"))
            .collect::<Vec<_>>()
            .await
    }

    /// Checks that exactly one document was produced and that it starts with
    /// the expected text.
    fn assert_sample_docs(docs: &[Document]) {
        // Check the count first so an empty result fails with a clear message
        // instead of an index-out-of-bounds panic.
        assert_eq!(docs.len(), 1);
        // `starts_with` avoids the byte-slice panic the `[..100]` form would hit
        // on short content or a non-char-boundary index.
        assert!(
            docs[0].page_content.starts_with(EXPECTED_PREFIX),
            "unexpected start of extracted text: {:?}",
            docs[0].page_content.chars().take(100).collect::<String>()
        );
    }

    #[tokio::test]
    async fn test_pdf_extract_loader() {
        let loader = PdfExtractLoader::from_path(SAMPLE_PDF_PATH)
            .expect("Failed to create PdfExtractLoader");

        let docs = load_all(loader).await;

        assert_sample_docs(&docs);
    }

    #[tokio::test]
    async fn test_pdf_extract_loader_reader() {
        let bytes = std::fs::read(SAMPLE_PDF_PATH).expect("Failed to read sample PDF");
        let reader = Cursor::new(bytes);

        let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader");

        let docs = load_all(loader).await;

        assert_sample_docs(&docs);
    }
}

0 comments on commit 54fbe26

Please sign in to comment.