forked from artidoro/qlora
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingest.py
59 lines (49 loc) · 1.9 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
import os
import json
import math
import torch
import spacy
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings
from chromadb.config import Settings
# Embedding checkpoint name; uses 1.5 GB of VRAM (high accuracy with lower VRAM usage).
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"

# Run on the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_type = "cuda"
else:
    device_type = "cpu"

# spaCy pipeline; its sentence boundaries are used to split oversized text blocks.
nlp = spacy.load("en_core_web_sm")
# Build retrieval chunks from every article file.
#
# Each file is cut into blocks at BLOCK_MARKER. Blocks are packed greedily into
# chunks that stay below MAX_CHUNK_WORDS whitespace-separated words; a block
# that is itself too large is packed sentence by sentence instead. The result
# is `texts` (list of chunk strings) and `chunks` (the same strings wrapped in
# Document objects).

# Exclusive upper bound on the number of whitespace-separated words per chunk.
MAX_CHUNK_WORDS = 1000
# Marker that introduces each description inside an article file.
BLOCK_MARKER = "This is the description of"

texts = [""]
# os.listdir returns names in arbitrary order; sorting makes the chunk
# boundaries reproducible from run to run.
for file in sorted(os.listdir("data/en_articles_klio")):
    # The context manager closes the handle instead of leaking it.
    with open(f"data/en_articles_klio/{file}", encoding="utf-8") as handle:
        description = handle.read()

    # str.split removes the marker, so it is re-attached to every non-empty piece.
    blocks = [
        BLOCK_MARKER + piece
        for piece in description.split(BLOCK_MARKER)
        if piece
    ]

    for block in blocks:
        block_words = len(block.split())
        if len(texts[-1].split()) + block_words < MAX_CHUNK_WORDS:
            # The block still fits into the chunk currently being filled.
            texts[-1] += block
        elif block_words < MAX_CHUNK_WORDS:
            # The block fits into a chunk of its own. Only leading whitespace
            # is dropped so that a following block appended to this chunk is
            # still separated from it.
            texts.append(block.lstrip())
        else:
            # Oversized block: start a new chunk and pack it sentence by sentence.
            # A single sentence at or above the limit still ends up as one
            # over-sized chunk.
            texts.append("")
            doc = nlp(block)
            for sent in doc.sents:
                # text_with_ws keeps the whitespace that followed the sentence;
                # plain .text drops it and fuses adjacent sentences together.
                sentence = sent.text_with_ws
                if len(texts[-1].split()) + len(sentence.split()) < MAX_CHUNK_WORDS:
                    texts[-1] += sentence
                else:
                    texts.append(sentence.lstrip())

    # Start a fresh chunk for the next file so chunks never span two files.
    # NOTE(review): placement reconstructed from a copy of the script that had
    # lost its indentation -- confirm the per-file reset is intended.
    texts.append("")

# Trim the separators kept above, then drop chunks that ended up empty.
texts = [text.strip() for text in texts]
texts = [text for text in texts if text]
chunks = [Document(page_content=text) for text in texts]
# Keyword arguments for the embedding wrapper: the checkpoint name, the device
# it should run on, and the instruction strings passed for documents and queries.
_embedding_options = {
    "model_name": EMBEDDING_MODEL_NAME,
    "model_kwargs": {"device": device_type},
    "embed_instruction": "Represent the document for retrieval:",
    "query_instruction": "Represent the question for retrieving supporting documents:",
}
embeddings = HuggingFaceInstructEmbeddings(**_embedding_options)
# Chroma client configuration: telemetry switched off, persistence switched on.
_chroma_settings = Settings(anonymized_telemetry=False, is_persistent=True)

# Embed all chunks and store them in a Chroma collection under DB_KLIO_ALPACA.
db = Chroma.from_documents(
    chunks,
    embeddings,
    client_settings=_chroma_settings,
    persist_directory="DB_KLIO_ALPACA",
)