Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Separate markdown into headings and paragraphs #9173

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
55318b5
split markdown by token count
NaokiHigashi28 Sep 25, 2024
43e88ef
change to async function
NaokiHigashi28 Sep 25, 2024
7fe49b9
remove comment
NaokiHigashi28 Sep 25, 2024
2e8ee0c
change
NaokiHigashi28 Sep 25, 2024
6c839ee
change
NaokiHigashi28 Sep 25, 2024
bf07ed4
change directory
NaokiHigashi28 Sep 25, 2024
5f43036
create package
NaokiHigashi28 Sep 27, 2024
b5d13c4
add dependencies
NaokiHigashi28 Sep 30, 2024
e9d90a8
delete test
NaokiHigashi28 Sep 30, 2024
a59c4f4
add module
NaokiHigashi28 Sep 30, 2024
471b770
change module name
NaokiHigashi28 Sep 30, 2024
2ac50d5
add test
NaokiHigashi28 Sep 30, 2024
6788cca
use langchain
NaokiHigashi28 Oct 1, 2024
a13e44a
change module name
NaokiHigashi28 Oct 1, 2024
bb1c509
fix
NaokiHigashi28 Oct 1, 2024
520d47e
Merge branch 'feat/openai-vector-searching' into feat/153983-154087-s…
NaokiHigashi28 Oct 1, 2024
4c1de97
use markdown
NaokiHigashi28 Oct 3, 2024
2d22213
typescriptize test
yuki-takei Oct 3, 2024
372b28f
add vitest configuration file
yuki-takei Oct 3, 2024
e354136
Merge pull request #9194 from weseek/imprv/159383-vitest-environment
yuki-takei Oct 3, 2024
f7daad7
delete dependencies
NaokiHigashi28 Oct 4, 2024
527502a
delete dependencies
NaokiHigashi28 Oct 4, 2024
5d6fe91
markdown split
NaokiHigashi28 Oct 4, 2024
70a93b8
add test
NaokiHigashi28 Oct 4, 2024
08cbf52
remove @dqbd/tiktoken
NaokiHigashi28 Oct 7, 2024
df2f8ed
remove japanese comment
NaokiHigashi28 Oct 7, 2024
430251a
change attribute name
NaokiHigashi28 Oct 7, 2024
324b56e
convert original yarn.lock
NaokiHigashi28 Oct 7, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .changeset/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"@growi/app",
"@growi/slackbot-proxy",
"@growi/custom-icons",
"@growi/markdown-splitter",
"@growi/editor",
"@growi/presentation",
"@growi/preset-*",
Expand Down
1 change: 1 addition & 0 deletions apps/app/next.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ const getTranspilePackages = () => {
const optimizePackageImports = [
'@growi/core',
'@growi/editor',
'@growi/markdown-splitter',
'@growi/pluginkit',
'@growi/presentation',
'@growi/preset-themes',
Expand Down
1 change: 1 addition & 0 deletions apps/app/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@
"@growi/core-styles": "link:../../packages/core-styles",
"@growi/custom-icons": "link:../../packages/custom-icons",
"@growi/editor": "link:../../packages/editor",
"@growi/markdown-splitter": "link:../../packages/markdown-splitter",
"@growi/ui": "link:../../packages/ui",
"@handsontable/react": "=2.1.0",
"@next/bundle-analyzer": "^14.1.3",
Expand Down
2 changes: 2 additions & 0 deletions packages/markdown-splitter/.eslintignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/dist/**
/types/**
5 changes: 5 additions & 0 deletions packages/markdown-splitter/.eslintrc.cjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module.exports = {
extends: [
'weseek/react',
],
};
1 change: 1 addition & 0 deletions packages/markdown-splitter/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/dist
43 changes: 43 additions & 0 deletions packages/markdown-splitter/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"name": "@growi/markdown-splitter",
"version": "1.0.0",
"license": "MIT",
"private": "true",
"type": "module",
"module": "dist/index.js",
"types": "dist/index.d.ts",
"files": [
"dist"
],
"main": "dist/index.cjs",
"exports": {
".": {
"import": "./dist/index.js",
"require": "./dist/index.cjs"
}
},
"scripts": {
"build": "vite build",
"clean": "shx rm -rf dist",
"dev": "vite build --mode dev",
"watch": "yarn dev -w --emptyOutDir=false",
"lint:js": "yarn eslint **/*.{js,ts}",
"lint:typecheck": "tsc",
"lint": "npm-run-all -p lint:*",
"test": "vitest run --coverage"
},
"devDependencies": {
"eslint-plugin-regex": "^1.8.0",
"hast-util-sanitize": "^4.1.0",
"pako": "^2.1.0",
"throttle-debounce": "^5.0.0",
"unified": "^10.1.2",
"unist-util-visit": "^4.0.0"
},
"peerDependencies": {
"react": "^18.2.0",
"react-dom": "^18.2.0"
},
"dependencies": {
}
}
1 change: 1 addition & 0 deletions packages/markdown-splitter/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './services/markdown-splitter';
106 changes: 106 additions & 0 deletions packages/markdown-splitter/src/services/markdown-splitter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/**
 * A labeled fragment of a Markdown document, as produced by
 * `splitMarkdownIntoChunks`.
 */
export type Chunk = {
// Identifier of the fragment: the section numbers joined by '-' followed by
// '-heading' or '-content' (e.g. '1-2-heading'); content that appears before
// any heading is labeled '0-content'.
label: string;
// Raw text of the fragment: the heading line itself for heading chunks, or the
// content lines joined with '\n' (trailing whitespace removed) for content chunks.
text: string;
};

/**
 * Flushes the buffered content lines into a single chunk.
 *
 * The lines are joined with '\n' and trailing whitespace is stripped; a chunk is
 * appended only when the resulting text is non-empty. The buffer is always
 * emptied in place, whether or not a chunk was produced.
 * @param chunks - Destination array that receives the new chunk (mutated).
 * @param contentLines - Buffered content lines (mutated: emptied on return).
 * @param label - Label to attach to the produced chunk.
 */
function processPendingContent(chunks: Chunk[], contentLines: string[], label: string) {
  const joined = contentLines.join('\n');
  const body = joined.trimEnd();

  // Empty the caller's buffer in place so it can be reused for the next section
  contentLines.splice(0, contentLines.length);

  if (body.length === 0) {
    return;
  }
  chunks.push({ label, text: body });
}

/**
 * Advances the hierarchical section counter for a heading of the given depth
 * and returns the resulting label.
 *
 * - Deeper than the current level: every missing level is filled in with 1.
 * - Same level: the last counter is incremented.
 * - Shallower: deeper counters are discarded, then the counter at `depth` is incremented.
 * @param sectionNumbers - Current counters, one per heading level (mutated in place).
 * @param depth - Heading depth (e.g. `#` is depth 1, `##` is depth 2).
 * @returns The counters joined with '-', e.g. '1-2-1'.
 */
function updateSectionNumbers(sectionNumbers: number[], depth: number): string {
  const goesDeeper = depth > sectionNumbers.length;

  if (goesDeeper) {
    // Pad skipped levels so that e.g. '#' followed by '###' yields '1-1-1'
    do {
      sectionNumbers.push(1);
    } while (sectionNumbers.length < depth);
  }
  else {
    // Drop counters below this level (a no-op when staying on the same level),
    // then move on to the next sibling section
    sectionNumbers.splice(depth);
    sectionNumbers[depth - 1] += 1;
  }

  return sectionNumbers.join('-');
}

/**
 * Splits Markdown text into labeled chunks of headings and content.
 *
 * - Every ATX heading (`#`, `##`, ...) becomes its own `<section>-heading` chunk,
 *   where `<section>` is the hierarchical section number (e.g. `1-2`).
 * - The text between headings becomes a `<section>-content` chunk; text that
 *   appears before the first heading is labeled `0-content`.
 * - List indentation and leading spaces are preserved, runs of blank lines are
 *   collapsed to a single blank line, and trailing blank lines are dropped.
 * - A line that starts with `#` but is not a heading (e.g. `#hashtag`) is kept
 *   as ordinary content instead of being discarded.
 * - Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
 *   so shell comments such as `# install` stay inside their code block.
 * @param markdown - The input Markdown string.
 * @returns An array of labeled chunks in document order; empty for blank or non-string input.
 */
export function splitMarkdownIntoChunks(markdown: string): Chunk[] {
  const chunks: Chunk[] = [];
  const sectionNumbers: number[] = [];

  if (typeof markdown !== 'string' || markdown.trim() === '') {
    return chunks;
  }

  // Hoisted so the patterns are not re-created for every line
  const headingPattern = /^(#+)\s+(.*)/;
  const fencePattern = /^(`{3,}|~{3,})(.*)$/;

  const lines = markdown.split('\n');
  const contentLines: string[] = [];
  let currentLabel = '';
  let previousLineEmpty = false;
  // Marker that opened the current fenced code block, or null when outside of one
  let openFence: string | null = null;

  // Emits the buffered content (if any) under the label of the current section
  const flushPendingContent = (): void => {
    if (contentLines.length > 0) {
      const contentLabel = currentLabel !== '' ? `${currentLabel}-content` : '0-content';
      processPendingContent(chunks, contentLines, contentLabel);
    }
  };

  for (const line of lines) {
    const trimmedLine = line.trim();

    // Track fenced code blocks: a fence is closed only by a marker of the same
    // character, at least as long as the opening one, with nothing after it
    const fenceMatch = fencePattern.exec(trimmedLine);
    if (fenceMatch != null) {
      const [, marker = '', rest = ''] = fenceMatch;
      if (openFence === null) {
        openFence = marker;
      }
      else if (marker[0] === openFence[0] && marker.length >= openFence.length && rest === '') {
        openFence = null;
      }
    }

    // Headings are only recognized outside of fenced code blocks
    const headerMatch = openFence === null ? headingPattern.exec(trimmedLine) : null;

    if (headerMatch != null) {
      // Process any pending content before starting a new section
      flushPendingContent();

      const [, hashes = ''] = headerMatch;
      currentLabel = updateSectionNumbers(sectionNumbers, hashes.length);
      chunks.push({ label: `${currentLabel}-heading`, text: line });
    }
    else if (trimmedLine === '') {
      // Keep at most one blank line in a row, and never start a chunk with one
      if (!previousLineEmpty && contentLines.length > 0) {
        contentLines.push('');
        previousLineEmpty = true;
      }
    }
    else {
      // Regular content, including '#'-prefixed lines that are not headings
      contentLines.push(line);
      previousLineEmpty = false;
    }
  }

  // Process any remaining content after the last line
  flushPendingContent();

  return chunks;
}
Loading
Loading