Skip to content

Commit

Permalink
Merge pull request #156 from llm-tools/loaders
Browse files Browse the repository at this point in the history
Added Markdown loader
  • Loading branch information
adhityan authored Nov 5, 2024
2 parents 73cd1e7 + d17e3e3 commit df0bfa7
Show file tree
Hide file tree
Showing 91 changed files with 1,633 additions and 321 deletions.
18 changes: 12 additions & 6 deletions core/embedjs-interfaces/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
2 changes: 1 addition & 1 deletion core/embedjs-interfaces/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@llm-tools/embedjs-interfaces",
"version": "0.1.16",
"version": "0.1.17",
"description": "Interfaces for extending the embedjs ecosystem",
"dependencies": {
"@langchain/core": "^0.3.17",
Expand Down
18 changes: 12 additions & 6 deletions core/embedjs-utils/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
4 changes: 2 additions & 2 deletions core/embedjs-utils/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-utils",
"version": "0.1.16",
"version": "0.1.17",
"description": "Useful util functions when extending the embedjs ecosystem",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.16"
"@llm-tools/embedjs-interfaces": "0.1.17"
},
"type": "module",
"main": "./src/index.js",
Expand Down
26 changes: 16 additions & 10 deletions core/embedjs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

### 🩹 Fixes

- renamed remaining instances of vectorDb to vectorDatabase ([ca79586](https://github.com/llm-tools/embedJs/commit/ca79586))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

### 🩹 Fixes

- renamed remaining instances of vectorDb to vectorDatabase ([ca79586](https://github.com/llm-tools/embedJs/commit/ca79586))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
6 changes: 3 additions & 3 deletions core/embedjs/package.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"type": "module",
"name": "@llm-tools/embedjs",
"version": "0.1.16",
"version": "0.1.17",
"description": "A NodeJS RAG framework to easily work with LLMs and custom datasets",
"dependencies": {
"@langchain/textsplitters": "^0.1.0",
"@llm-tools/embedjs-interfaces": "0.1.16",
"@llm-tools/embedjs-utils": "0.1.16",
"@llm-tools/embedjs-interfaces": "0.1.17",
"@llm-tools/embedjs-utils": "0.1.17",
"debug": "^4.3.7",
"langchain": "^0.3.5",
"md5": "^2.3.0",
Expand Down
2 changes: 1 addition & 1 deletion core/embedjs/src/core/rag-application-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ export class RAGApplicationBuilder {
constructor() {
this.loaders = [];
this.temperature = 0.1;
this.searchResultCount = 7;
this.searchResultCount = 30;
this.model = SIMPLE_MODELS.OPENAI_GPT4_TURBO;

this.systemMessage = `You are a helpful human like chat bot. Use relevant provided context and chat history to answer the query at the end. Answer in full.
Expand Down
2 changes: 1 addition & 1 deletion core/embedjs/src/loaders/url-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ export class UrlLoader extends BaseLoader<{ type: 'UrlLoader' }> {
}

override async *getUnfilteredChunks() {
const response = await getSafe(this.url.toString(), { headers: { 'Accept-Encoding': '' } });
const response = await getSafe(this.url.href, { headers: { 'Accept-Encoding': '' } });
const stream = response.body as unknown as NodeJS.ReadableStream;
let { mime } = await getMimeType(stream, { strict: true });
this.debug(`Loader stream detected type '${mime}'`);
Expand Down
40 changes: 24 additions & 16 deletions core/embedjs/src/util/mime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import createDebugMessages from 'debug';
import { BaseLoader } from '@llm-tools/embedjs-interfaces';
import { TextLoader } from '../loaders/text-loader.js';

export async function createLoaderFromMimeType(loader: string, mimeType: string): Promise<BaseLoader> {
export async function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise<BaseLoader> {
createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`);

switch (mimeType) {
Expand All @@ -15,7 +15,7 @@ export async function createLoaderFromMimeType(loader: string, mimeType: string)
);
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader');
return new DocxLoader({ filePathOrUrl: loader });
return new DocxLoader({ filePathOrUrl: loaderData });
}
case 'application/vnd.ms-excel':
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
Expand All @@ -25,14 +25,14 @@ export async function createLoaderFromMimeType(loader: string, mimeType: string)
);
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader');
return new ExcelLoader({ filePathOrUrl: loader });
return new ExcelLoader({ filePathOrUrl: loaderData });
}
case 'application/pdf': {
const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader');
return new PdfLoader({ filePathOrUrl: loader });
return new PdfLoader({ filePathOrUrl: loaderData });
}
case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
Expand All @@ -41,44 +41,52 @@ export async function createLoaderFromMimeType(loader: string, mimeType: string)
);
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader');
return new PptLoader({ filePathOrUrl: loader });
return new PptLoader({ filePathOrUrl: loaderData });
}
case 'text/plain': {
const fineType = mime.getType(loader);
createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Fine type for '${loader}' is '${fineType}'`);
const fineType = mime.getType(loaderData);
createDebugMessages('embedjs:util:createLoaderFromMimeType')(
`Fine type for '${loaderData}' is '${fineType}'`,
);
if (fineType === 'text/csv') {
const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load csv files');
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
});

createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
return new CsvLoader({ filePathOrUrl: loader });
} else return new TextLoader({ text: loader });
return new CsvLoader({ filePathOrUrl: loaderData });
} else return new TextLoader({ text: loaderData });
}
case 'application/csv': {
const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load csv files');
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
return new CsvLoader({ filePathOrUrl: loader });
return new CsvLoader({ filePathOrUrl: loaderData });
}
case 'text/html': {
const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader');
return new WebLoader({ urlOrContent: loader });
return new WebLoader({ urlOrContent: loaderData });
}
case 'text/xml': {
const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-sitemap` needs to be installed to load sitemaps');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader');

if (await SitemapLoader.test(loader)) {
return new SitemapLoader({ url: loader });
if (await SitemapLoader.test(loaderData)) {
return new SitemapLoader({ url: loaderData });
}
throw new Error(`No loader supported for generic xml`);

//This is not a Sitemap but is still XML
const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader');
return new XmlLoader({ filePathOrUrl: loaderData });
}
case undefined:
throw new Error(`MIME type could not be detected. Please file an issue if you think this is a bug.`);
Expand Down
18 changes: 12 additions & 6 deletions databases/embedjs-astra/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-astra/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "@llm-tools/embedjs-astradb",
"version": "0.1.16",
"version": "0.1.17",
"description": "Add AstraDB support to embedjs",
"dependencies": {
"@datastax/astra-db-ts": "^1.5.0",
"@llm-tools/embedjs-interfaces": "0.1.16",
"@llm-tools/embedjs-interfaces": "0.1.17",
"debug": "^4.3.7"
},
"type": "module",
Expand Down
18 changes: 12 additions & 6 deletions databases/embedjs-cosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-cosmos/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "@llm-tools/embedjs-cosmos",
"version": "0.1.16",
"version": "0.1.17",
"description": "Add CosmosDB support to embedjs",
"dependencies": {
"@azure/cosmos": "^4.1.1",
"@llm-tools/embedjs-interfaces": "0.1.16",
"@llm-tools/embedjs-interfaces": "0.1.17",
"debug": "^4.3.7"
},
"type": "module",
Expand Down
18 changes: 12 additions & 6 deletions databases/embedjs-hnswlib/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-hnswlib/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-hnswlib",
"version": "0.1.16",
"version": "0.1.17",
"description": "Add HNSWLib support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.16",
"@llm-tools/embedjs-interfaces": "0.1.17",
"debug": "^4.3.7",
"hnswlib-node": "^3.0.0"
},
Expand Down
18 changes: 12 additions & 6 deletions databases/embedjs-lancedb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-lancedb/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "@llm-tools/embedjs-lancedb",
"version": "0.1.16",
"version": "0.1.17",
"description": "Add LanceDb support to embedjs",
"dependencies": {
"@lancedb/lancedb": "^0.12.0",
"@llm-tools/embedjs-interfaces": "0.1.16",
"@llm-tools/embedjs-interfaces": "0.1.17",
"compute-cosine-similarity": "^1.1.0"
},
"type": "module",
Expand Down
18 changes: 12 additions & 6 deletions databases/embedjs-lmdb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.17 (2024-11-05)

### 🚀 Features

- added markdown/mdx loader ([847947d](https://github.com/llm-tools/embedJs/commit/847947d))

## 0.1.16 (2024-11-04)

### 🚀 Features

- added xml loader ([9172511](https://github.com/llm-tools/embedJs/commit/9172511))

## 0.1.15 and 0.1.14 (2024-11-01)

### 🚀 Features
Expand Down
Loading

0 comments on commit df0bfa7

Please sign in to comment.