Skip to content

Commit

Permalink
Added support to index py, ipynb, md, and R files (#89)
Browse files Browse the repository at this point in the history
  • Loading branch information
3coins authored Apr 18, 2023
1 parent 77a6256 commit 4393367
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 2 deletions.
5 changes: 3 additions & 2 deletions packages/jupyter-ai/jupyter_ai/actors/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
from ray.util.queue import Queue
from jupyter_core.paths import jupyter_data_dir
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from jupyter_ai.document_loaders.directory import DirectoryLoader


@ray.remote
Expand Down Expand Up @@ -42,7 +43,7 @@ def process_message(self, message: HumanChatMessage):
load_path = os.path.join(self.root_dir, dir_path)
loader = DirectoryLoader(
load_path,
glob="**/*.txt",
glob=['*.txt', '*.text', '*.md', '*.py', '*.ipynb', '*.R'],
loader_cls=TextLoader
)
documents = loader.load_and_split(
Expand Down
Empty file.
54 changes: 54 additions & 0 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import logging
from typing import Dict, List, Optional, Union

from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.directory import FILE_LOADER_TYPE, _is_visible
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.schema import Document
from wcmatch.pathlib import Path


logger = logging.getLogger(__name__)


class DirectoryLoader(BaseLoader):
    """Load documents from the files in a directory that match a glob.

    Unlike a single-pattern loader, ``glob`` may be either one pattern or a
    list of patterns; it is forwarded unchanged to ``wcmatch.pathlib.Path``.
    """

    def __init__(
        self,
        path: str,
        glob: Union[str, List[str]] = "**/[!.]*",
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        loader_kwargs: Optional[Dict] = None,
        recursive: bool = False,
    ):
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Directory to search.
            glob: Pattern, or list of patterns, selecting the files to load.
            silent_errors: When true, a file that fails to load is logged as
                a warning and skipped instead of aborting the whole load.
            load_hidden: When true, files rejected by ``_is_visible`` are
                loaded as well.
            loader_cls: Loader class instantiated once per matching file,
                called as ``loader_cls(file_path, **loader_kwargs)``.
            loader_kwargs: Extra keyword arguments for ``loader_cls``;
                ``None`` means no extra arguments.
            recursive: When true, match with ``Path.rglob``; otherwise with
                ``Path.glob``.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        # A fresh dict per instance, so no mutable default is ever shared.
        self.loader_kwargs = {} if loader_kwargs is None else loader_kwargs
        self.silent_errors = silent_errors
        self.recursive = recursive

    def load(self) -> List[Document]:
        """Load every matching file and return the combined documents.

        Returns:
            The documents produced by ``loader_cls`` for each matching file,
            concatenated in the order the glob yields the files.

        Raises:
            Exception: Whatever ``loader_cls`` raised for a file, re-raised
                unchanged, unless ``silent_errors`` is set.
        """
        root = Path(self.path)
        matches = root.rglob(self.glob) if self.recursive else root.glob(self.glob)
        docs: List[Document] = []
        for item in matches:
            if not item.is_file():
                continue
            # Visibility is judged on the path relative to the search root.
            # NOTE(review): presumably _is_visible rejects dot-prefixed path
            # components - confirm against the langchain helper.
            if not (self.load_hidden or _is_visible(item.relative_to(root))):
                continue
            try:
                sub_docs = self.loader_cls(str(item), **self.loader_kwargs).load()
                docs.extend(sub_docs)
            except Exception as e:
                if not self.silent_errors:
                    # Bare raise re-raises the active exception as-is.
                    raise
                # Name the offending file so a skipped document can be traced.
                logger.warning("Error loading file %s: %s", item, e)
        return docs
1 change: 1 addition & 0 deletions packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies = [
"jupyter_ai_magics",
"ray==2.2.0", # latest ray version 2.3.0 requires grpcio installation from conda
"faiss-cpu", # Not distributed by official repo
"wcmatch",
]

dynamic = ["version", "description", "authors", "urls", "keywords"]
Expand Down

0 comments on commit 4393367

Please sign in to comment.