01.ingest-data.js
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { MongoClient } from "mongodb";
import getEmbeddings from "./02.get-embeddings.js";
import * as fs from "fs";

// Entry point: download the PDF, chunk it, embed each chunk, and store the results in Atlas.
async function ingestData() {
  const client = new MongoClient(process.env.ATLAS_CONNECTION_STRING);
  try {
    // Download the online PDF and save it to a local file
    const rawData = await fetch(
      "https://pub-5cd27299eac74caa8dfae0dc8ee78e15.r2.dev/test-bucket/Sci.Tech-RevandScience.pdf"
    );
    const pdfBuffer = await rawData.arrayBuffer();
    const pdfData = Buffer.from(pdfBuffer);
    fs.writeFileSync("Sci.Tech-RevandScience.pdf", pdfData);

    // Load the PDF and chunk its text
    const loader = new PDFLoader("Sci.Tech-RevandScience.pdf");
    const data = await loader.load();
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 400,
      chunkOverlap: 20,
    });
    const docs = await textSplitter.splitDocuments(data);
    console.log(`Successfully chunked the PDF into ${docs.length} documents.`);

    // Connect to your Atlas cluster
    await client.connect();
    const db = client.db("rag_db");
    const collection = db.collection("test");
    console.log("Generating embeddings and inserting documents.");

    // Process documents in batches of 50
    let docCount = 0;
    const batchSize = 50;
    for (let i = 0; i < docs.length; i += batchSize) {
      const batch = docs.slice(i, i + batchSize);
      await Promise.all(
        batch.map(async (doc) => {
          // Insert each chunk and its embedding into Atlas
          const embeddings = await getEmbeddings(doc.pageContent);
          await collection.insertOne({
            document: doc,
            embedding: embeddings,
          });
          docCount += 1;
        })
      );
      console.log(`Inserted ${docCount} documents so far.`);
    }
    console.log(`Successfully inserted a total of ${docCount} documents.`);
  } catch (err) {
    console.log(err.stack);
  } finally {
    await client.close();
  }
}

ingestData().catch(console.dir);
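
For reference, the imported 02.get-embeddings.js is expected to export a default async function that maps a single text chunk to an embedding vector (an array of numbers). Its actual implementation is not shown on this page; the following is a minimal sketch only, assuming (for illustration) the @langchain/openai package, the text-embedding-3-small model, and an OPENAI_API_KEY set in the environment.

// 02.get-embeddings.js (hypothetical sketch, not the repository's actual implementation)
import { OpenAIEmbeddings } from "@langchain/openai";

// Reuse a single embeddings client across calls
const embeddings = new OpenAIEmbeddings({ model: "text-embedding-3-small" });

// Return the embedding vector (number[]) for one text chunk
export default async function getEmbeddings(text) {
  return embeddings.embedQuery(text);
}

Any helper with the same shape (text in, numeric vector out) would work here, since the ingest script only awaits getEmbeddings(doc.pageContent) and stores the result in the embedding field.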