-
Notifications
You must be signed in to change notification settings - Fork 99
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5a6160c
commit 54fbe26
Showing
7 changed files
with
152 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
#[cfg(feature = "lopdf")] | ||
pub mod lo_loader; | ||
|
||
#[cfg(feature = "pdf-extract")] | ||
pub mod pdf_extract_loader; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
use std::{io::Read, path::Path, pin::Pin}; | ||
|
||
use async_stream::stream; | ||
use async_trait::async_trait; | ||
use futures::Stream; | ||
use pdf_extract::{output_doc, PlainTextOutput}; | ||
|
||
use crate::{ | ||
document_loaders::{process_doc_stream, Loader, LoaderError}, | ||
schemas::Document, | ||
text_splitter::TextSplitter, | ||
}; | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct PdfExtractLoader { | ||
document: lopdf::Document, | ||
} | ||
|
||
impl PdfExtractLoader { | ||
/// Creates a new PdfLoader from anything that implements the Read trait. | ||
/// This is a generic constructor which can be used with any type of reader. | ||
/// | ||
/// # Example | ||
/// | ||
/// ```rust,ignore | ||
/// use std::io::Cursor; | ||
/// let data = Cursor::new(vec![...] /* some PDF data */); | ||
/// let loader = PdfExtractLoader::new(data)?; | ||
/// ``` | ||
/// | ||
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> { | ||
let document = lopdf::Document::load_from(reader)?; | ||
Ok(Self { document }) | ||
} | ||
/// Creates a new PdfLoader from a path to a PDF file. | ||
/// This loads the PDF document and creates a PdfLoader from it. | ||
/// | ||
/// # Example | ||
/// | ||
/// ```rust,ignore | ||
/// let loader = PdfExtractLoader::from_path("/path/to/my.pdf")?; | ||
/// ``` | ||
/// | ||
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, LoaderError> { | ||
let document = lopdf::Document::load(path)?; | ||
Ok(Self { document }) | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl Loader for PdfExtractLoader { | ||
async fn load( | ||
mut self, | ||
) -> Result< | ||
Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>, | ||
LoaderError, | ||
> { | ||
let mut buffer: Vec<u8> = Vec::new(); | ||
let mut output = PlainTextOutput::new(&mut buffer as &mut dyn std::io::Write); | ||
output_doc(&self.document, &mut output)?; | ||
let doc = Document::new(String::from_utf8(buffer)?); | ||
|
||
let stream = stream! { | ||
yield Ok(doc); | ||
}; | ||
|
||
Ok(Box::pin(stream)) | ||
} | ||
|
||
async fn load_and_split<TS: TextSplitter + 'static>( | ||
mut self, | ||
splitter: TS, | ||
) -> Result< | ||
Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>, | ||
LoaderError, | ||
> { | ||
let doc_stream = self.load().await?; | ||
let stream = process_doc_stream(doc_stream, splitter).await; | ||
Ok(Box::pin(stream)) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use std::{fs::File, io::Cursor}; | ||
|
||
use futures_util::StreamExt; | ||
|
||
use super::*; | ||
|
||
#[tokio::test] | ||
async fn test_lo_pdf_loader() { | ||
let path = "./src/document_loaders/test_data/sample.pdf"; | ||
|
||
let loader = PdfExtractLoader::from_path(path).expect("Failed to create PdfExtractLoader"); | ||
|
||
let docs = loader | ||
.load() | ||
.await | ||
.unwrap() | ||
.map(|d| d.unwrap()) | ||
.collect::<Vec<_>>() | ||
.await; | ||
|
||
assert_eq!(&docs[0].page_content[..100], "\n\nSample PDF Document\n\nRobert Maron\nGrzegorz Grudzi´nski\n\nFebruary 20, 1999\n\n2\n\nContents\n\n1 Templat"); | ||
assert_eq!(docs.len(), 1); | ||
} | ||
|
||
#[tokio::test] | ||
async fn test_lo_pdf_loader_reader() { | ||
let path = "./src/document_loaders/test_data/sample.pdf"; | ||
let mut file = File::open(path).unwrap(); | ||
let mut buffer = Vec::new(); | ||
file.read_to_end(&mut buffer).unwrap(); | ||
let reader = Cursor::new(buffer); | ||
|
||
let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader"); | ||
|
||
let docs = loader | ||
.load() | ||
.await | ||
.unwrap() | ||
.map(|d| d.unwrap()) | ||
.collect::<Vec<_>>() | ||
.await; | ||
|
||
assert_eq!(&docs[0].page_content[..100], "\n\nSample PDF Document\n\nRobert Maron\nGrzegorz Grudzi´nski\n\nFebruary 20, 1999\n\n2\n\nContents\n\n1 Templat"); | ||
assert_eq!(docs.len(), 1); | ||
} | ||
} |