From 278c58687f66835b88096574d8c066b5240b6c1a Mon Sep 17 00:00:00 2001 From: William Espegren Date: Thu, 1 Aug 2024 14:09:32 +0200 Subject: [PATCH 1/4] feat: additional metadata added to Spider tool --- .../nodes/documentloaders/Spider/Spider.ts | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts index e4817ac974e..14862de8677 100644 --- a/packages/components/nodes/documentloaders/Spider/Spider.ts +++ b/packages/components/nodes/documentloaders/Spider/Spider.ts @@ -10,6 +10,7 @@ interface SpiderLoaderParameters { apiKey?: string mode?: 'crawl' | 'scrape' limit?: number + additionalMetadata?: Record params?: Record } @@ -18,11 +19,12 @@ class SpiderLoader extends BaseDocumentLoader { private url: string private mode: 'crawl' | 'scrape' private limit?: number + private additionalMetadata?: Record private params?: Record constructor(loaderParams: SpiderLoaderParameters) { super() - const { apiKey, url, mode = 'crawl', limit, params } = loaderParams + const { apiKey, url, mode = 'crawl', limit, additionalMetadata, params } = loaderParams if (!apiKey) { throw new Error('Spider API key not set. 
You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.') } @@ -31,6 +33,7 @@ class SpiderLoader extends BaseDocumentLoader { this.url = url this.mode = mode this.limit = Number(limit) + this.additionalMetadata = additionalMetadata this.params = params } @@ -61,7 +64,10 @@ class SpiderLoader extends BaseDocumentLoader { (doc) => new Document({ pageContent: doc.content || '', - metadata: { source: doc.url } + metadata: { + ...(this.additionalMetadata || {}), + source: doc.url + } }) ) } @@ -125,6 +131,14 @@ class Spider_DocumentLoaders implements INode { type: 'number', default: 25 }, + { + label: 'Additional Metadata', + name: 'additional_metadata', + type: 'json', + description: 'Additional metadata to be added to the extracted documents', + optional: true, + additionalParams: true + }, { label: 'Additional Parameters', name: 'params', @@ -149,6 +163,7 @@ class Spider_DocumentLoaders implements INode { const url = nodeData.inputs?.url as string const mode = nodeData.inputs?.mode as 'crawl' | 'scrape' const limit = nodeData.inputs?.limit as number + let additionalMetadata = nodeData.inputs?.additional_metadata let params = nodeData.inputs?.params || {} const credentialData = await getCredentialData(nodeData.credential ?? 
'', options) const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData) @@ -161,6 +176,20 @@ class Spider_DocumentLoaders implements INode { } } + if (additionalMetadata) { + if (typeof additionalMetadata === 'string') { + try { + additionalMetadata = JSON.parse(additionalMetadata) + } catch (e) { + throw new Error('Invalid JSON string provided for additional metadata') + } + } else if (typeof additionalMetadata !== 'object') { + throw new Error('Additional metadata must be a valid JSON object') + } + } else { + additionalMetadata = {} + } + // Ensure return_format is set to markdown params.return_format = 'markdown' @@ -169,6 +198,7 @@ class Spider_DocumentLoaders implements INode { mode: mode as 'crawl' | 'scrape', apiKey: spiderApiKey, limit: limit as number, + additionalMetadata: additionalMetadata as Record, params: params as Record } From 8bb404c948fdf09c2203af649a7e94fd6f0783b5 Mon Sep 17 00:00:00 2001 From: William Espegren Date: Thu, 1 Aug 2024 14:20:39 +0200 Subject: [PATCH 2/4] console.log error instead of throwing to keep flow --- packages/components/nodes/documentloaders/Spider/Spider.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts index 14862de8677..d2d4afadadd 100644 --- a/packages/components/nodes/documentloaders/Spider/Spider.ts +++ b/packages/components/nodes/documentloaders/Spider/Spider.ts @@ -172,7 +172,7 @@ class Spider_DocumentLoaders implements INode { try { params = JSON.parse(params) } catch (e) { - throw new Error('Invalid JSON string provided for params') + console.error('Invalid JSON string provided for params') } } @@ -181,10 +181,10 @@ class Spider_DocumentLoaders implements INode { try { additionalMetadata = JSON.parse(additionalMetadata) } catch (e) { - throw new Error('Invalid JSON string provided for additional metadata') + console.error('Invalid JSON string 
provided for additional metadata') } } else if (typeof additionalMetadata !== 'object') { - throw new Error('Additional metadata must be a valid JSON object') + console.error('Additional metadata must be a valid JSON object') } } else { additionalMetadata = {} From 9df5164969f765538cd2697829a56e91780a3289 Mon Sep 17 00:00:00 2001 From: William Espegren Date: Thu, 1 Aug 2024 18:16:19 +0200 Subject: [PATCH 3/4] add omit --- .../nodes/documentloaders/Spider/Spider.ts | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts index d2d4afadadd..7a28741dea8 100644 --- a/packages/components/nodes/documentloaders/Spider/Spider.ts +++ b/packages/components/nodes/documentloaders/Spider/Spider.ts @@ -1,3 +1,4 @@ +import { omit } from 'lodash' import { TextSplitter } from 'langchain/text_splitter' import { Document, DocumentInterface } from '@langchain/core/documents' import { BaseDocumentLoader } from 'langchain/document_loaders/base' @@ -148,6 +149,17 @@ class Spider_DocumentLoaders implements INode { placeholder: '{ "anti_bot": true }', type: 'json', optional: true + }, + { + label: 'Omit Metadata Keys', + name: 'omitMetadataKeys', + type: 'string', + rows: 4, + description: + 'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma. Use * to omit all metadata keys except the ones you specify in the Additional Metadata field', + placeholder: 'key1, key2, key3.nestedKey1', + optional: true, + additionalParams: true } ] this.credential = { @@ -167,6 +179,12 @@ class Spider_DocumentLoaders implements INode { let params = nodeData.inputs?.params || {} const credentialData = await getCredentialData(nodeData.credential ?? 
'', options) const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData) + const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string + + let omitMetadataKeys: string[] = [] + if (_omitMetadataKeys) { + omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim()) + } if (typeof params === 'string') { try { @@ -212,6 +230,20 @@ class Spider_DocumentLoaders implements INode { docs = await loader.load() } + docs = docs.map((doc: DocumentInterface) => ({ + ...doc, + metadata: + _omitMetadataKeys === '*' + ? additionalMetadata + : omit( + { + ...doc.metadata, + ...additionalMetadata + }, + omitMetadataKeys + ) + })) + return docs } } From c21686cf5813465786108553300af2ab093d8047 Mon Sep 17 00:00:00 2001 From: William Espegren Date: Thu, 1 Aug 2024 18:20:56 +0200 Subject: [PATCH 4/4] pnpm lint --- .../nodes/documentloaders/Spider/Spider.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts index 7a28741dea8..3dbb4baf5ba 100644 --- a/packages/components/nodes/documentloaders/Spider/Spider.ts +++ b/packages/components/nodes/documentloaders/Spider/Spider.ts @@ -236,12 +236,12 @@ class Spider_DocumentLoaders implements INode { _omitMetadataKeys === '*' ? additionalMetadata : omit( - { - ...doc.metadata, - ...additionalMetadata - }, - omitMetadataKeys - ) + { + ...doc.metadata, + ...additionalMetadata + }, + omitMetadataKeys + ) })) return docs