Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: additional metadata added to Spider tool #2923

Merged
merged 5 commits into from
Aug 2, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 65 additions & 3 deletions packages/components/nodes/documentloaders/Spider/Spider.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
Expand All @@ -10,6 +11,7 @@ interface SpiderLoaderParameters {
apiKey?: string
mode?: 'crawl' | 'scrape'
limit?: number
additionalMetadata?: Record<string, unknown>
params?: Record<string, unknown>
}

Expand All @@ -18,11 +20,12 @@ class SpiderLoader extends BaseDocumentLoader {
private url: string
private mode: 'crawl' | 'scrape'
private limit?: number
private additionalMetadata?: Record<string, unknown>
private params?: Record<string, unknown>

constructor(loaderParams: SpiderLoaderParameters) {
super()
const { apiKey, url, mode = 'crawl', limit, params } = loaderParams
const { apiKey, url, mode = 'crawl', limit, additionalMetadata, params } = loaderParams
if (!apiKey) {
throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
}
Expand All @@ -31,6 +34,7 @@ class SpiderLoader extends BaseDocumentLoader {
this.url = url
this.mode = mode
this.limit = Number(limit)
this.additionalMetadata = additionalMetadata
this.params = params
}

Expand Down Expand Up @@ -61,7 +65,10 @@ class SpiderLoader extends BaseDocumentLoader {
(doc) =>
new Document({
pageContent: doc.content || '',
metadata: { source: doc.url }
metadata: {
...(this.additionalMetadata || {}),
source: doc.url
}
})
)
}
Expand Down Expand Up @@ -125,6 +132,14 @@ class Spider_DocumentLoaders implements INode {
type: 'number',
default: 25
},
{
label: 'Additional Metadata',
name: 'additional_metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Additional Parameters',
name: 'params',
Expand All @@ -134,6 +149,17 @@ class Spider_DocumentLoaders implements INode {
placeholder: '{ "anti_bot": true }',
type: 'json',
optional: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma. Use * to omit all metadata keys except the ones you specify in the Additional Metadata field',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
]
this.credential = {
Expand All @@ -149,18 +175,39 @@ class Spider_DocumentLoaders implements INode {
const url = nodeData.inputs?.url as string
const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
const limit = nodeData.inputs?.limit as number
let additionalMetadata = nodeData.inputs?.additional_metadata
let params = nodeData.inputs?.params || {}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string

let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}

if (typeof params === 'string') {
try {
params = JSON.parse(params)
} catch (e) {
throw new Error('Invalid JSON string provided for params')
console.error('Invalid JSON string provided for params')
}
}

if (additionalMetadata) {
if (typeof additionalMetadata === 'string') {
try {
additionalMetadata = JSON.parse(additionalMetadata)
} catch (e) {
console.error('Invalid JSON string provided for additional metadata')
}
} else if (typeof additionalMetadata !== 'object') {
console.error('Additional metadata must be a valid JSON object')
}
} else {
additionalMetadata = {}
}

// Ensure return_format is set to markdown
params.return_format = 'markdown'

Expand All @@ -169,6 +216,7 @@ class Spider_DocumentLoaders implements INode {
mode: mode as 'crawl' | 'scrape',
apiKey: spiderApiKey,
limit: limit as number,
additionalMetadata: additionalMetadata as Record<string, unknown>,
params: params as Record<string, unknown>
}

Expand All @@ -182,6 +230,20 @@ class Spider_DocumentLoaders implements INode {
docs = await loader.load()
}

docs = docs.map((doc: DocumentInterface) => ({
...doc,
metadata:
_omitMetadataKeys === '*'
? additionalMetadata
: omit(
{
...doc.metadata,
...additionalMetadata
},
omitMetadataKeys
)
}))

return docs
}
}
Expand Down
Loading