diff --git a/docs/core_docs/docs/how_to/document_loader_html.ipynb b/docs/core_docs/docs/how_to/document_loader_html.ipynb new file mode 100644 index 000000000000..e7116a606924 --- /dev/null +++ b/docs/core_docs/docs/how_to/document_loader_html.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0c6c50fc-15e1-4767-925a-53a37c430b9b", + "metadata": {}, + "source": [ + "# How to load HTML\n", + "\n", + "The HyperText Markup Language or [HTML](https://en.wikipedia.org/wiki/HTML) is the standard markup language for documents designed to be displayed in a web browser.\n", + "\n", + "This covers how to load `HTML` documents into a LangChain [Document](https://v02.api.js.langchain.com/classes/langchain_core_documents.Document.html) objects that we can use downstream.\n", + "\n", + "Parsing HTML files often requires specialized tools. Here we demonstrate parsing via [Unstructured](https://unstructured-io.github.io/unstructured/). Head over to the integrations page to find integrations with additional services, such as [FireCrawl](/docs/integrations/document_loaders/web_loaders/firecrawl).\n", + "\n", + ":::info Prerequisites\n", + "\n", + "This guide assumes familiarity with the following concepts:\n", + "\n", + "- [Documents](/docs/concepts#document)\n", + "- [Document Loaders](/docs/concepts#document-loaders)\n", + "\n", + ":::\n", + "\n", + "## Installation\n", + "\n", + "```{=mdx}\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\"\n", + "\n", + "\n", + " @langchain/community\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "868cfb85", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Although Unstructured has an open source offering, you're still required to provide an API key to access the service. To get everything up and running, follow these two steps:\n", + "\n", + "1. Download & start the Docker container:\n", + " \n", + "```bash\n", + "docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0\n", + "```\n", + "\n", + "2. Get a free API key & API URL [here](https://unstructured.io/api-key), and set it in your environment (as per the Unstructured website, it may take up to an hour to allocate your API key & URL.):\n", + "\n", + "```bash\n", + "export UNSTRUCTURED_API_KEY=\"...\"\n", + "# Replace with your `Full URL` from the email\n", + "export UNSTRUCTURED_API_URL=\"https://-.api.unstructuredapp.io/general/v0/general\" \n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "a4d93b2e", + "metadata": {}, + "source": [ + "## Loading HTML with Unstructured" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7d167ca3-c7c7-4ef0-b509-080629f0f482", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " Document {\n", + " pageContent: 'Word of the Day',\n", + " metadata: {\n", + " category_depth: 0,\n", + " languages: [Array],\n", + " filename: 'wordoftheday.html',\n", + " filetype: 'text/html',\n", + " category: 'Title'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: ': April 10, 2023',\n", + " metadata: {\n", + " emphasized_text_contents: [Array],\n", + " emphasized_text_tags: [Array],\n", + " languages: [Array],\n", + " parent_id: 'b845e60d85ff7d10abda4e5f9a37eec8',\n", + " filename: 'wordoftheday.html',\n", + " filetype: 'text/html',\n", + " category: 'UncategorizedText'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: 'foible',\n", + " metadata: {\n", + " category_depth: 1,\n", + " languages: [Array],\n", + " parent_id: 'b845e60d85ff7d10abda4e5f9a37eec8',\n", + " filename: 'wordoftheday.html',\n", + " filetype: 'text/html',\n", + " category: 'Title'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: 'play',\n", + " metadata: {\n", + " category_depth: 0,\n", + " link_texts: [Array],\n", + " link_urls: [Array],\n", + " link_start_indexes: [Array],\n", + " languages: [Array],\n", + " filename: 'wordoftheday.html',\n", + " filetype: 'text/html',\n", + " category: 'Title'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: 'noun',\n", + " metadata: {\n", + " category_depth: 0,\n", + " emphasized_text_contents: [Array],\n", + " emphasized_text_tags: [Array],\n", + " languages: [Array],\n", + " filename: 'wordoftheday.html',\n", + " filetype: 'text/html',\n", + " category: 'Title'\n", + " }\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "import { UnstructuredLoader } from \"@langchain/community/document_loaders/fs/unstructured\";\n", + "\n", + "const filePath = \"../../../../libs/langchain-community/src/tools/fixtures/wordoftheday.html\"\n", + "\n", + "const loader = new UnstructuredLoader(filePath, {\n", + " apiKey: process.env.UNSTRUCTURED_API_KEY,\n", + " apiUrl: process.env.UNSTRUCTURED_API_URL,\n", + "});\n", + "\n", + "const data = await loader.load()\n", + "console.log(data.slice(0, 5));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/core_docs/docs/how_to/document_loader_markdown.ipynb b/docs/core_docs/docs/how_to/document_loader_markdown.ipynb new file mode 100644 index 000000000000..a72849c739df --- /dev/null +++ b/docs/core_docs/docs/how_to/document_loader_markdown.ipynb @@ -0,0 +1,295 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d836a98a-ad14-4bed-af76-e1877f7ef8a4", + "metadata": {}, + "source": [ + "# How to load Markdown\n", + "\n", + "[Markdown](https://en.wikipedia.org/wiki/Markdown) is a lightweight markup language for creating formatted text using a plain-text editor.\n", + "\n", + "Here we cover how to load `Markdown` documents into LangChain [Document](https://v02.api.js.langchain.com/classes/langchain_core_documents.Document.html) objects that we can use downstream.\n", + "\n", + "We will cover:\n", + "\n", + "- Basic usage;\n", + "- Parsing of Markdown into elements such as titles, list items, and text.\n", + "\n", + "LangChain implements an [UnstructuredLoader](https://v02.api.js.langchain.com/classes/langchain_document_loaders_fs_unstructured.UnstructuredLoader.html) class.\n", + "\n", + ":::info Prerequisites\n", + "\n", + "This guide assumes familiarity with the following concepts:\n", + "\n", + "- [Documents](/docs/concepts#document)\n", + "- [Document Loaders](/docs/concepts#document-loaders)\n", + "\n", + ":::\n", + "\n", + "## Installation\n", + "\n", + "```{=mdx}\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\"\n", + "\n", + "\n", + " @langchain/community\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "897a69e9", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Although Unstructured has an open source offering, you're still required to provide an API key to access the service. To get everything up and running, follow these two steps:\n", + "\n", + "1. Download & start the Docker container:\n", + " \n", + "```bash\n", + "docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0\n", + "```\n", + "\n", + "2. Get a free API key & API URL [here](https://unstructured.io/api-key), and set it in your environment (as per the Unstructured website, it may take up to an hour to allocate your API key & URL.):\n", + "\n", + "```bash\n", + "export UNSTRUCTURED_API_KEY=\"...\"\n", + "# Replace with your `Full URL` from the email\n", + "export UNSTRUCTURED_API_URL=\"https://-.api.unstructuredapp.io/general/v0/general\" \n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "ea8c41f8-a8dc-48cc-b78d-7b3e2427a34c", + "metadata": {}, + "source": [ + "Basic usage will ingest a Markdown file to a single document. Here we demonstrate on LangChain's readme:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "80c50cc4-7ce9-4418-81b9-29c52c7b3627", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " Document {\n", + " pageContent: '🦜️🔗 LangChain.js',\n", + " metadata: {\n", + " languages: [Array],\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " category: 'Title'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: '⚡ Building applications with LLMs through composability ⚡',\n", + " metadata: {\n", + " languages: [Array],\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " category: 'Title'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: 'Looking for the Python version? Check out LangChain.',\n", + " metadata: {\n", + " languages: [Array],\n", + " parent_id: '7ea17bcb17b10f303cbb93b4cb95de93',\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " category: 'NarrativeText'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: 'To help you ship LangChain apps to production faster, check out LangSmith.\\n' +\n", + " 'LangSmith is a unified developer platform for building, testing, and monitoring LLM applications.\\n' +\n", + " 'Fill out this form to get on the waitlist or speak with our sales team.',\n", + " metadata: {\n", + " languages: [Array],\n", + " parent_id: '7ea17bcb17b10f303cbb93b4cb95de93',\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " category: 'NarrativeText'\n", + " }\n", + " },\n", + " Document {\n", + " pageContent: '⚡️ Quick Install',\n", + " metadata: {\n", + " languages: [Array],\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " category: 'Title'\n", + " }\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "import { UnstructuredLoader } from \"@langchain/community/document_loaders/fs/unstructured\";\n", + "\n", + "const markdownPath = \"../../../../README.md\";\n", + "\n", + "const loader = new UnstructuredLoader(markdownPath, {\n", + " apiKey: process.env.UNSTRUCTURED_API_KEY,\n", + " apiUrl: process.env.UNSTRUCTURED_API_URL,\n", + "});\n", + "\n", + "const data = await loader.load()\n", + "console.log(data.slice(0, 5));" + ] + }, + { + "cell_type": "markdown", + "id": "b7560a6e-ca5d-47e1-b176-a9c40e763ff3", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `chunkingStrategy: \"by_title\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a986bbce-7fd3-41d1-bc47-49f9f57c7cd1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of documents: 13\n", + "\n", + "Document {\n", + " pageContent: '🦜️🔗 LangChain.js\\n' +\n", + " '\\n' +\n", + " '⚡ Building applications with LLMs through composability ⚡\\n' +\n", + " '\\n' +\n", + " 'Looking for the Python version? Check out LangChain.\\n' +\n", + " '\\n' +\n", + " 'To help you ship LangChain apps to production faster, check out LangSmith.\\n' +\n", + " 'LangSmith is a unified developer platform for building, testing, and monitoring LLM applications.\\n' +\n", + " 'Fill out this form to get on the waitlist or speak with our sales team.',\n", + " metadata: {\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " languages: [ 'eng' ],\n", + " orig_elements: 'eJzNUtuO0zAQ/ZVRnquSS3PjBcGyPHURgr5tV2hijxNTJ45ip0u14t8Zp1y6CCF4ACFLlufuc+bcPkRkqKfBv9cyegpREWNZosxS0RRVzmeTCiFlnmRUFZmQ0QqinjxK9Mj5D5HShgbsKRS/vX7+8uZ63S9ZIeBP4xLw9NE/6XxvQsDg0M7YkuPIbURDG919Wp1zQu5+llVGfMta7GdFsVo8MniSErZcfdWhHtYfXOj2dcROe0MRN/oRUUmYlI1o+EpilcWZaJo6azaiqXNJdfYvEKUFJvBi1kbqoQUcR6MFem0HB/fad7Dd3jjw3WTntgNh+9E6bLTR/gTn4t9CmhHFTc1w80oKSUlTpFWaFKWsVR5nFf0dpOwdcfoDvi+p2Vp7CJQoOzF+gjcn39kBjjQ5ZucZXHUkDmBnf7H3Sy5e4zQxkUfahYY/4UQqVcZJpSpspKqSMslVllWJzDdMC6XVf8jJzkJHZoSTncF1evwOPSiHdWJhnKycRRAQKHSephWIR0y961lW6/3w7Q3aAcI8aKVJgqQjGTvSBKNBz+T3ywaaLwpdgSfnlwcOEno7aG+nsCcW6iP58ohX2phlru94xtKLf9iSB/5d2Ok9smC1Y3sCNxIezpq3M5toiAER9r/a6t1n6BJ/zg==',\n", + " category: 'CompositeElement'\n", + " }\n", + "}\n", + "\n", + "\n", + "Document {\n", + " pageContent: '⚡️ Quick Install\\n' +\n", + " '\\n' +\n", + " 'You can use npm, yarn, or pnpm to install LangChain.js\\n' +\n", + " '\\n' +\n", + " 'npm install -S langchain or yarn add langchain or pnpm add langchain\\n' +\n", + " '\\n' +\n", + " 'typescript\\n' +\n", + " 'import { ChatOpenAI } from \"langchain/chat_models/openai\";\\n' +\n", + " '\\n' +\n", + " '🌐 Supported Environments\\n' +\n", + " '\\n' +\n", + " 'LangChain is written in TypeScript and can be used in:\\n' +\n", + " '\\n' +\n", + " 'Node.js (ESM and CommonJS) - 18.x, 19.x, 20.x\\n' +\n", + " '\\n' +\n", + " 'Cloudflare Workers\\n' +\n", + " '\\n' +\n", + " 'Vercel / Next.js (Browser, Serverless and Edge functions)\\n' +\n", + " '\\n' +\n", + " 'Supabase Edge Functions\\n' +\n", + " '\\n' +\n", + " 'Browser\\n' +\n", + " '\\n' +\n", + " 'Deno',\n", + " metadata: {\n", + " filename: 'README.md',\n", + " filetype: 'text/markdown',\n", + " languages: [ 'eng' ],\n", + " orig_elements: 'eJzNlm1v2zYQx7/KQa9WwE1Iik/qXnWpB2RoM2wOOgx1URzJY6pVogyJTlME/e6j3KZIhgBzULjIG0Li3VH+/e/BfHNdUUc9pfyuDdUzqGzUjUUda1ZbL7R1UQetnNdMK9swVy2g6iljwIzF/7qKbUcJe5qD/1w+f/FqedSH2Ws25E+bnSHTVT5+n/tuNnSYLrZ4QVOxvKkoXVRvPy+++My+663QyNfbSCzCH9vWf4DTNGXsdsE3J563uaOqxP0XIDSxCdobSZIYd9w7JpQlLU3TaKf4YQDK7gbHB8h4m/jvYQseE2wngrTpF/AJx7SAYYRNeYU8QPtFAHhZvnzyHtt09M90W40zHEfM7SWdz0fep0otuUISLBqMjfNFjMYzI6SWFFWQj1CVGf2G++kK5uP9jD7rMgsEGMLd3Z1ad3YfpJHWsubSchGQeNRItUGPElF7wck2hy/9OWbyY7vJ69T2m2HMcA0l3/n3DaXnp/AZ4jj0sK6+AR6XNb/rh0DddDwUL2zX1c97NUpjVAEOxkh0tbOaN1qU1vG8VtYGe6CSuNvpwda+rJEzWG03MzAFWKbLdhzS/FOnvUhcdChlNC6iKBWuJVrCGMhxIaKMP6i4/1fP2+jfGhnaCT6Obc5UHhOcl4+vdhUAmMJuKjiaB0Mo1mcPKmdBvlFWK6ZMaXfNI2ojIvNORMsUHWiSf5cqZ6WOy2SDn5arVzv+k6Hvh/Tb6gk8BW6PrhbAm3kV7Ojqthgv2ymfZurvrQ4hvRLCSaUEj8YG77TzQTNriYv6B/0hPEiHk24oTdGVePhrGD/QOO0LyxRHKZivAxldS41akzXcxELPm/oxJv01jZ46OIazsrHL/i/j8HGicQErGi9p7GiadtWwDBcEcZt8boc0PdlXE9KlAoSkZh4PtUBZ5oRjTAbiSgd3oLn+XZqUYYgOy3Vgh/zrDfK+xA0rqY6GaQrGo5JM1azcgawzjeOa2CMk/przvXMayvXQEA8meEmCsxiDrkO54/iAVvtHSPiC0nA/3tt/AY+igwk=',\n", + " category: 'CompositeElement'\n", + " }\n", + "}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "const loader = new UnstructuredLoader(markdownPath, {\n", + " chunkingStrategy: \"by_title\"\n", + "});\n", + "\n", + "\n", + "const data = await loader.load()\n", + "\n", + "console.log(`Number of documents: ${data.length}\\n`)\n", + "\n", + "for (const doc of data.slice(0, 2)) {\n", + " console.log(doc);\n", + " console.log(\"\\n\");\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "117dc6b0-9baa-44a2-9d1d-fc38ecf7a233", + "metadata": {}, + "source": [ + "Note that in this case we recover just one distinct element type:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "75abc139-3ded-4e8e-9f21-d0c8ec40fdfc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Set(1) { 'CompositeElement' }\n" + ] + } + ], + "source": [ + "const categories = new Set(data.map((document) => document.metadata.category));\n", + "console.log(categories);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/core_docs/docs/how_to/index.mdx b/docs/core_docs/docs/how_to/index.mdx index cab7fc40414d..5d5e27e74481 100644 --- a/docs/core_docs/docs/how_to/index.mdx +++ b/docs/core_docs/docs/how_to/index.mdx @@ -100,6 +100,8 @@ Document Loaders are responsible for loading documents from a variety of sources - [How to: load data from a directory](/docs/how_to/document_loader_directory) - [How to: load PDF files](/docs/how_to/document_loader_pdf) - [How to: write a custom document loader](/docs/how_to/document_loader_custom) +- [How to: load HTML data](/docs/how_to/document_loader_html) +- [How to: load Markdown data](/docs/how_to/document_loader_markdown) ### Text splitters diff --git a/examples/src/document_loaders/firecrawl.ts b/examples/src/document_loaders/firecrawl.ts index d8524e1b4b6c..3eb4b46e4eb9 100644 --- a/examples/src/document_loaders/firecrawl.ts +++ b/examples/src/document_loaders/firecrawl.ts @@ -1,4 +1,4 @@ -import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl"; +import { FireCrawlLoader } from "@langchain/community/document_loaders/web/firecrawl"; const loader = new FireCrawlLoader({ url: "https://firecrawl.dev", // The URL to scrape diff --git a/libs/langchain-community/src/document_loaders/fs/unstructured.ts b/libs/langchain-community/src/document_loaders/fs/unstructured.ts index 1048018e3d94..f9040b11110a 100644 --- a/libs/langchain-community/src/document_loaders/fs/unstructured.ts +++ b/libs/langchain-community/src/document_loaders/fs/unstructured.ts @@ -1,7 +1,7 @@ import type { basename as BasenameT } from "node:path"; import type { readFile as ReadFileT } from "node:fs/promises"; import { Document } from "@langchain/core/documents"; -import { getEnv } from "@langchain/core/utils/env"; +import { getEnv, getEnvironmentVariable } from "@langchain/core/utils/env"; import { StringWithAutocomplete } from "@langchain/core/utils/types"; import { DirectoryLoader, @@ -181,8 +181,12 @@ export class UnstructuredLoader extends BaseDocumentLoader { } else { this.filePath = filePathOrLegacyApiUrl; const options = optionsOrLegacyFilePath; - this.apiKey = options.apiKey; - this.apiUrl = options.apiUrl ?? this.apiUrl; + this.apiKey = + options.apiKey ?? getEnvironmentVariable("UNSTRUCTURED_API_KEY"); + this.apiUrl = + options.apiUrl ?? + getEnvironmentVariable("UNSTRUCTURED_API_URL") ?? + this.apiUrl; this.strategy = options.strategy ?? this.strategy; this.encoding = options.encoding; this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages;