Skip to content

Commit

Permalink
Hugging Face Loader: Add lazy load (#4799)
Browse files Browse the repository at this point in the history
# Add lazy load to HF datasets loader

Unfortunately, there are no tests for this loader as far as I can tell, so I verified the code manually.
  • Loading branch information
eyurtsev authored May 17, 2023
1 parent a63ab7d commit 2d20a11
Showing 1 changed file with 11 additions and 8 deletions.
19 changes: 11 additions & 8 deletions langchain/document_loaders/hugging_face_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Loader that loads HuggingFace datasets."""
from typing import List, Mapping, Optional, Sequence, Union
from typing import Iterator, List, Mapping, Optional, Sequence, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
Expand All @@ -23,8 +23,7 @@ def __init__(
use_auth_token: Optional[Union[bool, str]] = None,
num_proc: Optional[int] = None,
):
"""
Initialize the HuggingFaceDatasetLoader.
"""Initialize the HuggingFaceDatasetLoader.
Args:
path: Path or name of the dataset.
Expand All @@ -50,8 +49,10 @@ def __init__(
self.use_auth_token = use_auth_token
self.num_proc = num_proc

def load(self) -> List[Document]:
"""Load documents."""
def lazy_load(
self,
) -> Iterator[Document]:
"""Load documents lazily."""
try:
from datasets import load_dataset
except ImportError:
Expand All @@ -72,13 +73,15 @@ def load(self) -> List[Document]:
num_proc=self.num_proc,
)

docs = [
yield from (
Document(
page_content=row.pop(self.page_content_column),
metadata=row,
)
for key in dataset.keys()
for row in dataset[key]
]
)

return docs
def load(self) -> List[Document]:
"""Load documents."""
return list(self.lazy_load())

0 comments on commit 2d20a11

Please sign in to comment.