feat: Support anchor linking.

oramasearch · Aug 7, 2023 · 5000d1d · 5000d1d
1 parent c8a0fa0
commit 5000d1d
Show file tree

Hide file tree

Showing 9 changed files with 96 additions and 30 deletions.
diff --git a/packages/docs/pages/plugins/plugin-parsedoc.mdx b/packages/docs/pages/plugins/plugin-parsedoc.mdx
@@ -53,6 +53,7 @@ An asynchronous function that takes three arguments:
 - `globPath`: a string representing a glob path to reading the files from.
 - `options`: an object containing the following properties:
   - `transformFn` (optional): a function that passes an object as its only argument. It contains the raw HTML/Markdown chunk, tag name, parsed content and html attributes.
+    If the function adds an `additionalProperties` object to the transformed node, it will be merged with the original node's properties.
   - `mergeStrategy` (optional): a value that defines how to handle consecutive chunks of the same tag. The default value is `merge`. Accepted values are:
     - `merge`: consecutive chunks with the same tag will be merged into one document for the index.
     - `split`: consecutive chunks with the same tag will be split into separate documents for the index.
@@ -67,6 +68,7 @@ A asynchronous function that takes three arguments. Should be used internally by
 - `fileType`: a string representing the file type. Accepted values are `html` and `md`.
 - `options`: an object containing the following properties:
   - `transformFn` (optional): a function that passes an object as its only argument. It contains the raw HTML/Markdown chunk, tag name, parsed content and html attributes.
+    If the function adds an `additionalProperties` object to the transformed node, it will be merged with the original node's properties.
   - `mergeStrategy` (optional): a value that defines how to handle consecutive chunks of the same tag. The default value is `merge`. Accepted values are:
     - `merge`: consecutive chunks with the same tag will be merged into one document for the index.
     - `split`: consecutive chunks with the same tag will be split into separate documents for the index.

diff --git a/packages/plugin-docusaurus/package.json b/packages/plugin-docusaurus/package.json
@@ -51,6 +51,7 @@
     "@orama/orama": "workspace:*",
     "@orama/plugin-match-highlight": "workspace:*",
     "@orama/plugin-parsedoc": "workspace:*",
+    "github-slugger": "^2.0.0",
     "pako": "^2.1.0",
     "vfile-message": "^3.1.3"
   },

diff --git a/packages/plugin-docusaurus/src/server/index.ts b/packages/plugin-docusaurus/src/server/index.ts
@@ -3,8 +3,9 @@ import type { LoadContext, Plugin } from '@docusaurus/types'
 import { create, insertMultiple, save } from '@orama/orama'
 import { documentsStore } from '@orama/orama/components'
 import { OramaWithHighlight, afterInsert as highlightAfterInsert } from '@orama/plugin-match-highlight'
-import type { DefaultSchemaElement, NodeContent } from '@orama/plugin-parsedoc'
+import type { DefaultSchemaElement, NodeContent, PopulateFnContext } from '@orama/plugin-parsedoc'
 import { defaultHtmlSchema, populate } from '@orama/plugin-parsedoc'
+import * as githubSlugger from 'github-slugger'
 import { cp, readFile, writeFile } from 'node:fs/promises'
 import { resolve } from 'node:path'
 import { fileURLToPath } from 'node:url'
@@ -23,7 +24,9 @@ function indexPath(outDir: string, version: string): string {
   return resolve(outDir, INDEX_FILE.replace('@VERSION@', version))
 }
 
-function transformFn(node: NodeContent): NodeContent {
+function transformFn(node: NodeContent, context: PopulateFnContext): NodeContent {
+  let raw
+
   switch (node.tag) {
     case 'strong':
     case 'a':
@@ -34,10 +37,30 @@ function transformFn(node: NodeContent): NodeContent {
     case 'b':
     case 'p':
     case 'ul':
-      return { ...node, raw: `<p>${node.content}</p>` }
-    default:
-      return node
+      raw = `<p>${node.content}</p>`
+      break
+    case 'h1':
+    case 'h2':
+    case 'h3':
+    case 'h4':
+    case 'h5':
+    case 'h6':
+      context.lastLink = node.properties?.id ?? githubSlugger.slug(node.content)
+      break
   }
+
+  const transformed = {
+    ...node,
+    additionalProperties: {
+      hash: context.lastLink
+    }
+  }
+
+  if (raw) {
+    transformed.raw = raw
+  }
+
+  return transformed
 }
 
 function defaultToSectionSchema(
@@ -46,7 +69,7 @@ function defaultToSectionSchema(
   sectionTitle: string,
   version: string
 ): SectionSchema {
-  const { content, type } = node
+  const { content, type, properties } = node
 
   if (!sectionTitle) {
     sectionTitle = (pageRoute.split('/').pop() ?? '')
@@ -58,6 +81,7 @@ function defaultToSectionSchema(
 
   return {
     pageRoute,
+    hash: (properties?.hash as string) ?? '',
     sectionTitle: pageRoute ? sectionTitle : 'Home',
     sectionContent: content,
     type,
@@ -92,6 +116,10 @@ async function generateDocument(
     if (!section.pageRoute.startsWith('/')) {
       section.pageRoute = '/' + section.pageRoute
     }
+
+    if (section.hash) {
+      section.pageRoute += `#${section.hash}`
+    }
   }
 
   return sections

diff --git a/packages/plugin-docusaurus/src/server/types.ts b/packages/plugin-docusaurus/src/server/types.ts
@@ -14,6 +14,7 @@ export interface SectionSchema extends Document {
   pageRoute: string
   sectionTitle: string
   version: string
+  hash: string
 }
 
 export type RawDataWithPositions = RawData & { positions: Record<string, Record<string, Record<string, Position[]>>> }

diff --git a/packages/plugin-docusaurus/test/integration.ts b/packages/plugin-docusaurus/test/integration.ts
@@ -101,20 +101,20 @@ await test('generated DBs have indexed pages content', async () => {
   // Search results seem reasonable
   const indexSearchResult = await search(database, 'index')
   assert.ok(indexSearchResult.count === 1)
-  assert.ok(indexSearchResult.hits[0].document.pageRoute === '/')
+  assert.ok(indexSearchResult.hits[0].document.pageRoute === '/#main')
 
   const catSearchResult = await search(database, 'cat')
   assert.ok(catSearchResult.count === 1)
   assert.ok(catSearchResult.hits[0].document.pageRoute === '/animals_cat')
 
   const dogSearchResult = await search(database, 'dog')
   assert.ok(dogSearchResult.count === 2)
-  assert.ok(dogSearchResult.hits[0].document.pageRoute === '/animals_dog')
+  assert.ok(dogSearchResult.hits[0].document.pageRoute === '/animals_dog#dog')
 
   const domesticSearchResult = await search(database, 'domestic')
   assert.ok(domesticSearchResult.count === 2)
   assert.ok(domesticSearchResult.hits[0].document.pageRoute === '/animals_cat')
-  assert.ok(domesticSearchResult.hits[1].document.pageRoute === '/animals_dog')
+  assert.ok(domesticSearchResult.hits[1].document.pageRoute === '/animals_dog#dog')
 
   // We do not have content about turtles
   const turtleSearchResult = await search(database, 'turtle')

diff --git a/packages/plugin-parsedoc/src/index.ts b/packages/plugin-parsedoc/src/index.ts
@@ -29,9 +29,12 @@ export interface DefaultSchemaElement extends Document {
   properties?: Properties
 }
 
+export type PopulateFnContext = Record<string, any>
+
 interface PopulateFromGlobOptions {
   transformFn?: TransformFn
   mergeStrategy?: MergeStrategy
+  context?: PopulateFnContext
 }
 
 type PopulateOptions = PopulateFromGlobOptions & { basePath?: string }
@@ -54,12 +57,11 @@ const populateFromFile = async (db: Orama, filename: string, options?: PopulateF
   return populate(db, data, fileType, { ...options, basePath: `${filename}/` })
 }
 
-export const populate = async (
-  db: Orama,
+export const parseFile = async (
   data: Buffer | string,
   fileType: FileType,
   options?: PopulateOptions
-): Promise<string[]> => {
+): Promise<DefaultSchemaElement[]> => {
   const records: DefaultSchemaElement[] = []
   switch (fileType) {
     case 'md':
@@ -80,13 +82,34 @@ export const populate = async (
       return fileType
     /* c8 ignore stop */
   }
-  return insertMultiple(db, records)
+
+  return records
+}
+
+export const populate = async (
+  db: Orama,
+  data: Buffer | string,
+  fileType: FileType,
+  options?: PopulateOptions
+): Promise<string[]> => {
+  return insertMultiple(db, await parseFile(data, fileType, options))
 }
 
 function rehypeOrama(records: DefaultSchemaElement[], options?: PopulateOptions): (tree: Root) => void {
+  if (!options) {
+    options = {}
+  }
+
   return (tree: Root) => {
     tree.children.forEach((child, i) => {
-      visitChildren(child, tree, `${options?.basePath /* c8 ignore next */ ?? ''}root[${i}]`, records, options)
+      visitChildren(
+        child,
+        tree,
+        `${options?.basePath /* c8 ignore next */ ?? ''}root[${i}]`,
+        records,
+        options!,
+        structuredClone(options?.context ?? {})
+      )
     })
   }
 }
@@ -96,7 +119,8 @@ function visitChildren(
   parent: Parent,
   path: string,
   records: DefaultSchemaElement[],
-  options?: PopulateOptions
+  options: PopulateOptions,
+  context: PopulateFnContext
 ): void {
   if (node.type === 'text') {
     addRecords(
@@ -105,23 +129,24 @@ function visitChildren(
       path,
       (parent as Element).properties,
       records,
-      options?.mergeStrategy ?? 'merge'
+      options.mergeStrategy ?? 'merge'
     )
     return
   }
 
   if (!('tagName' in node)) return
 
-  const transformedNode = typeof options?.transformFn === 'function' ? applyTransform(node, options.transformFn) : node
+  const transformedNode =
+    typeof options?.transformFn === 'function' ? applyTransform(node, options.transformFn, context) : node
 
   transformedNode.children.forEach((child, i) => {
-    visitChildren(child, transformedNode, `${path}.${transformedNode.tagName}[${i}]`, records, options)
+    visitChildren(child, transformedNode, `${path}.${transformedNode.tagName}[${i}]`, records, options, context)
   })
 }
 
-function applyTransform(node: Element, transformFn: TransformFn): Element {
+function applyTransform(node: Element, transformFn: TransformFn, context: PopulateFnContext): Element {
   const preparedNode = prepareNode(node)
-  const transformedNode = transformFn(preparedNode)
+  const transformedNode = transformFn(preparedNode, context)
   return applyChanges(node, transformedNode)
 }
 
@@ -134,14 +159,19 @@ function prepareNode(node: Element): NodeContent {
 }
 
 function applyChanges(node: Element, transformedNode: NodeContent): Element {
+  let changed = node
+
   if (toHtml(node) !== transformedNode.raw) {
-    return fromHtml(transformedNode.raw, { fragment: true }).children[0] as Element
+    changed = fromHtml(transformedNode.raw, { fragment: true }).children[0] as Element
+  } else {
+    node.tagName = transformedNode.tag
+    if (toString(node) !== transformedNode.content) {
+      changed = fromString(node, transformedNode.content)
+    }
   }
-  node.tagName = transformedNode.tag
-  if (toString(node) !== transformedNode.content) {
-    return fromString(node, transformedNode.content)
-  }
-  return node
+
+  changed.properties = { ...changed.properties, ...transformedNode.additionalProperties }
+  return changed
 }
 
 function addRecords(
@@ -203,6 +233,7 @@ export interface NodeContent {
   raw: string
   content: string
   properties?: Properties
+  additionalProperties?: Properties
 }
 
-export type TransformFn = (node: NodeContent) => NodeContent
+export type TransformFn = (node: NodeContent, context: PopulateFnContext) => NodeContent
diff --git a/packages/stemmers/package.json b/packages/stemmers/package.json
@@ -193,4 +193,4 @@
   "engines": {
     "node": ">= 16.0.0"
   }
-}
+}
diff --git a/packages/stopwords/package.json b/packages/stopwords/package.json
@@ -186,4 +186,4 @@
   "engines": {
     "node": ">= 16.0.0"
   }
-}
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml