diff --git a/.changeset/config.json b/.changeset/config.json index 81aed512429..ed7a3ea2d01 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -15,6 +15,7 @@ "@growi/app", "@growi/slackbot-proxy", "@growi/custom-icons", + "@growi/markdown-splitter", "@growi/editor", "@growi/presentation", "@growi/preset-*", diff --git a/apps/app/next.config.js b/apps/app/next.config.js index 52d73a1a211..a3feebb1fb2 100644 --- a/apps/app/next.config.js +++ b/apps/app/next.config.js @@ -65,6 +65,7 @@ const getTranspilePackages = () => { const optimizePackageImports = [ '@growi/core', '@growi/editor', + '@growi/markdown-splitter', '@growi/pluginkit', '@growi/presentation', '@growi/preset-themes', diff --git a/apps/app/package.json b/apps/app/package.json index 297ee5c051b..cf6e81cc9f1 100644 --- a/apps/app/package.json +++ b/apps/app/package.json @@ -222,6 +222,7 @@ "@growi/core-styles": "link:../../packages/core-styles", "@growi/custom-icons": "link:../../packages/custom-icons", "@growi/editor": "link:../../packages/editor", + "@growi/markdown-splitter": "link:../../packages/markdown-splitter", "@growi/ui": "link:../../packages/ui", "@handsontable/react": "=2.1.0", "@next/bundle-analyzer": "^14.1.3", diff --git a/packages/markdown-splitter/.eslintignore b/packages/markdown-splitter/.eslintignore new file mode 100644 index 00000000000..b91fe2afeaf --- /dev/null +++ b/packages/markdown-splitter/.eslintignore @@ -0,0 +1,2 @@ +/dist/** +/types/** diff --git a/packages/markdown-splitter/.eslintrc.cjs b/packages/markdown-splitter/.eslintrc.cjs new file mode 100644 index 00000000000..dc418225bdd --- /dev/null +++ b/packages/markdown-splitter/.eslintrc.cjs @@ -0,0 +1,5 @@ +module.exports = { + extends: [ + 'weseek/react', + ], +}; diff --git a/packages/markdown-splitter/.gitignore b/packages/markdown-splitter/.gitignore new file mode 100644 index 00000000000..9b1c8b133c9 --- /dev/null +++ b/packages/markdown-splitter/.gitignore @@ -0,0 +1 @@ +/dist diff --git 
/**
 * A labeled fragment of a Markdown document.
 *
 * `label` is the dash-joined section number followed by `-heading` or `-content`
 * (for example `1-2-heading`, `1-2-content`); content that precedes the first
 * heading is labeled `0-content`. `text` is the raw Markdown of the fragment.
 */
export type Chunk = {
  label: string;
  text: string;
};

/**
 * Recognizes an ATX heading on an already-trimmed line: one to six `#` characters
 * followed by whitespace or the end of the line. Anything else that merely starts
 * with `#` (e.g. `#hashtag`, or a run of seven or more `#`) is not a heading.
 */
const ATX_HEADING_PATTERN = /^(#{1,6})(?:\s|$)/;

/** Line separator accepting both LF and CRLF so no stray `\r` ends up in chunk text. */
const LINE_BREAK_PATTERN = /\r?\n/;

/** Section label used for content that appears before the first heading. */
const PREAMBLE_SECTION_LABEL = '0';

/**
 * Appends the buffered content lines to `chunks` as a single chunk and empties the buffer.
 * Trailing whitespace of the joined text is removed; nothing is appended when the
 * resulting text is empty.
 * @param chunks - The array that receives the new chunk (mutated).
 * @param contentLines - The buffered content lines (mutated: always cleared).
 * @param label - The label to assign to the chunk.
 */
function processPendingContent(chunks: Chunk[], contentLines: string[], label: string): void {
  const text = contentLines.join('\n').trimEnd();
  if (text !== '') {
    chunks.push({ label, text });
  }
  contentLines.length = 0; // reuse the same buffer for the next section
}

/**
 * Advances the section counter for a heading of the given depth and returns the new label.
 *
 * - A deeper heading than the current level appends `1` for every missing level, so
 *   skipped levels (`#` followed by `###`) produce `1-1-1`.
 * - A heading at the same or a shallower level drops all deeper counters and
 *   increments the counter at `depth`.
 * @param sectionNumbers - The current section counters, one per level (mutated).
 * @param depth - The heading depth (`#` is 1, `##` is 2, ...).
 * @returns The section counters joined with `-`.
 */
function updateSectionNumbers(sectionNumbers: number[], depth: number): string {
  if (depth > sectionNumbers.length) {
    while (sectionNumbers.length < depth) {
      sectionNumbers.push(1);
    }
  }
  else {
    // splice() is a no-op when depth equals the current level, so one branch covers both cases
    sectionNumbers.splice(depth);
    sectionNumbers[depth - 1]++;
  }
  return sectionNumbers.join('-');
}

/**
 * Splits Markdown text into labeled chunks: one `<n>-heading` chunk per heading and one
 * `<n>-content` chunk for the text that follows it (`0-content` before the first heading).
 *
 * Behavior:
 * - Only ATX headings of depth 1-6 (`#` ... `######` followed by whitespace or end of line)
 *   start a new section. Other lines beginning with `#`, such as `#hashtag`, are kept as
 *   ordinary content rather than being discarded.
 * - Both LF and CRLF line endings are accepted; lines inside a chunk are joined with `\n`.
 * - Leading indentation of content lines is preserved, consecutive blank lines collapse
 *   into one, and blank lines at the start or end of a section are removed.
 *
 * NOTE(review): lines are classified one at a time without tracking fenced code blocks,
 * so a `# comment` line inside a code fence is still treated as a heading.
 * @param markdown - The input Markdown string.
 * @returns The labeled chunks in document order; empty for blank or non-string input.
 */
export function splitMarkdownIntoChunks(markdown: string): Chunk[] {
  const chunks: Chunk[] = [];

  if (typeof markdown !== 'string' || markdown.trim() === '') {
    return chunks;
  }

  const sectionNumbers: number[] = [];
  const contentLines: string[] = [];
  let currentLabel = '';

  // Emits the buffered content (if any) under the label of the section it belongs to
  const flushContent = (): void => {
    if (contentLines.length > 0) {
      const sectionLabel = currentLabel !== '' ? currentLabel : PREAMBLE_SECTION_LABEL;
      processPendingContent(chunks, contentLines, `${sectionLabel}-content`);
    }
  };

  for (const line of markdown.split(LINE_BREAK_PATTERN)) {
    const trimmedLine = line.trim();
    const headingMatch = ATX_HEADING_PATTERN.exec(trimmedLine);

    if (headingMatch !== null) {
      // A heading closes the previous section's content and opens a new section
      flushContent();
      currentLabel = updateSectionNumbers(sectionNumbers, headingMatch[1].length);
      chunks.push({ label: `${currentLabel}-heading`, text: line });
    }
    else if (trimmedLine === '') {
      // Keep at most one blank line between paragraphs and none at the start of a section
      if (contentLines.length > 0 && contentLines[contentLines.length - 1] !== '') {
        contentLines.push('');
      }
    }
    else {
      // Keep the untrimmed line so list indentation and leading spaces survive
      contentLines.push(line);
    }
  }

  // Content after the last heading (or a document without headings)
  flushContent();

  return chunks;
}
+ +# Chapter 1 +Content of chapter 1. + +### Section 1.1.1 +Content of section 1.1.1. + +## Section 1.2 +Content of section 1.2. + +# Chapter 2 +Content of chapter 2. + +## Section 2.1 +Content of section 2.1. + `; + const expected: Chunk[] = [ + { + label: '0-content', + text: 'Introduction without a header.', + }, + { + label: '1-heading', + text: '# Chapter 1', + }, + { + label: '1-content', + text: 'Content of chapter 1.', + }, + { + label: '1-1-1-heading', + text: '### Section 1.1.1', + }, + { + label: '1-1-1-content', + text: 'Content of section 1.1.1.', + }, + { + label: '1-2-heading', + text: '## Section 1.2', + }, + { + label: '1-2-content', + text: 'Content of section 1.2.', + }, + { + label: '2-heading', + text: '# Chapter 2', + }, + { + label: '2-content', + text: 'Content of chapter 2.', + }, + { + label: '2-1-heading', + text: '## Section 2.1', + }, + { + label: '2-1-content', + text: 'Content of section 2.1.', + }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + + test('handles markdown with skipped heading levels', () => { + const markdown = ` +# Header 1 +Content under header 1. + +#### Header 1.1.1.1 +Content under header 1.1.1.1. + +## Header 1.2 +Content under header 1.2. + +# Header 2 +Content under header 2. + `; + const expected: Chunk[] = [ + { label: '1-heading', text: '# Header 1' }, + { label: '1-content', text: 'Content under header 1.' }, + { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' }, + { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' }, + { label: '1-2-heading', text: '## Header 1.2' }, + { label: '1-2-content', text: 'Content under header 1.2.' }, + { label: '2-heading', text: '# Header 2' }, + { label: '2-content', text: 'Content under header 2.' }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + + test('handles malformed headings', () => { + const markdown = ` +# Header 1 +Content under header 1. 
+ +#### Header 1.1.1.1 +Content under header 1.1.1.1. + `; + const expected: Chunk[] = [ + { label: '1-heading', text: '# Header 1' }, + { label: '1-content', text: 'Content under header 1.' }, + { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' }, + { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + + test('handles multiple content blocks before any headers', () => { + const markdown = ` +This is the first paragraph without a header. + +This is the second paragraph without a header. + +# Header 1 +Content under header 1. + `; + const expected: Chunk[] = [ + { + label: '0-content', + text: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.', + }, + { label: '1-heading', text: '# Header 1' }, + { label: '1-content', text: 'Content under header 1.' }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + + test('handles markdown with only headers and no content', () => { + const markdown = ` +# Header 1 + +## Header 1.1 + +### Header 1.1.1 + `; + const expected: Chunk[] = [ + { label: '1-heading', text: '# Header 1' }, + { label: '1-1-heading', text: '## Header 1.1' }, + { label: '1-1-1-heading', text: '### Header 1.1.1' }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + + test('handles markdown with mixed content and headers', () => { + const markdown = ` +# Header 1 +Content under header 1. + +## Header 1.1 +Content under header 1.1. +Another piece of content. + +# Header 2 +Content under header 2. + `; + const expected: Chunk[] = [ + { label: '1-heading', text: '# Header 1' }, + { label: '1-content', text: 'Content under header 1.' }, + { label: '1-1-heading', text: '## Header 1.1' }, + { label: '1-1-content', text: 'Content under header 1.1.\nAnother piece of content.' 
}, + { label: '2-heading', text: '# Header 2' }, + { label: '2-content', text: 'Content under header 2.' }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + + test('preserves list indentation and reduces unnecessary line breaks', () => { + const markdown = ` +# Header 1 +Content under header 1. + +- Item 1 + - Subitem 1 +- Item 2 + + +# Header 2 +Content under header 2. + `; + const expected: Chunk[] = [ + { label: '1-heading', text: '# Header 1' }, + { label: '1-content', text: 'Content under header 1.\n\n- Item 1\n - Subitem 1\n- Item 2' }, + { label: '2-heading', text: '# Header 2' }, + { label: '2-content', text: 'Content under header 2.' }, + ]; + const result = splitMarkdownIntoChunks(markdown); + expect(result).toEqual(expected); + }); + +}); diff --git a/packages/markdown-splitter/tsconfig.json b/packages/markdown-splitter/tsconfig.json new file mode 100644 index 00000000000..f83d70be021 --- /dev/null +++ b/packages/markdown-splitter/tsconfig.json @@ -0,0 +1,16 @@ +{ + "$schema": "http://json.schemastore.org/tsconfig", + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "baseUrl": ".", + "paths": { + "~/*": ["./src/*"] + }, + "types": [ + "vitest/globals" + ] + }, + "include": [ + "src", "test" + ] +} diff --git a/packages/markdown-splitter/vite.config.ts b/packages/markdown-splitter/vite.config.ts new file mode 100644 index 00000000000..951e8d9f1f7 --- /dev/null +++ b/packages/markdown-splitter/vite.config.ts @@ -0,0 +1,39 @@ +import path from 'path'; + +import glob from 'glob'; +import { nodeExternals } from 'rollup-plugin-node-externals'; +import { defineConfig } from 'vite'; +import dts from 'vite-plugin-dts'; + +// https://vitejs.dev/config/ +export default defineConfig({ + plugins: [ + dts({ + copyDtsFiles: true, + }), + { + ...nodeExternals({ + devDeps: true, + builtinsPrefix: 'ignore', + }), + enforce: 'pre', + }, + ], + build: { + outDir: 'dist', + sourcemap: true, + lib: { + entry: 
glob.sync(path.resolve(__dirname, 'src/**/*.ts'), { + ignore: '**/*.spec.ts', + }), + name: 'core-libs', + formats: ['es', 'cjs'], + }, + rollupOptions: { + output: { + preserveModules: true, + preserveModulesRoot: 'src', + }, + }, + }, +}); diff --git a/packages/markdown-splitter/vitest.config.ts b/packages/markdown-splitter/vitest.config.ts new file mode 100644 index 00000000000..fb40f0b422e --- /dev/null +++ b/packages/markdown-splitter/vitest.config.ts @@ -0,0 +1,25 @@ +import tsconfigPaths from 'vite-tsconfig-paths'; +import { defineConfig, coverageConfigDefaults } from 'vitest/config'; + +export default defineConfig({ + plugins: [ + tsconfigPaths(), + ], + test: { + environment: 'node', + clearMocks: true, + globals: true, + coverage: { + exclude: [ + ...coverageConfigDefaults.exclude, + 'src/**/index.ts', + ], + thresholds: { + statements: 100, + branches: 100, + lines: 100, + functions: 100, + }, + }, + }, +}); diff --git a/yarn.lock b/yarn.lock index 1426867ccbb..80777c475e4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2189,6 +2189,9 @@ react "^18.2.0" react-dom "^18.2.0" +"@growi/markdown-splitter@link:packages/markdown-splitter": + version "1.0.0" + "@growi/pluginkit@link:packages/pluginkit": version "1.0.1" dependencies: