Просмотр исходного кода

Merge pull request #9173 from weseek/feat/153983-154087-split-markdown-per-header-sections

feat: Separate markdown into headings and paragraphs
mergify[bot] 1 год назад
Родитель
Коммит
cb56e0c785

+ 1 - 0
.changeset/config.json

@@ -15,6 +15,7 @@
     "@growi/app",
     "@growi/app",
     "@growi/slackbot-proxy",
     "@growi/slackbot-proxy",
     "@growi/custom-icons",
     "@growi/custom-icons",
+    "@growi/markdown-splitter",
     "@growi/editor",
     "@growi/editor",
     "@growi/presentation",
     "@growi/presentation",
     "@growi/preset-*",
     "@growi/preset-*",

+ 1 - 0
apps/app/next.config.js

@@ -73,6 +73,7 @@ const getTranspilePackages = () => {
 const optimizePackageImports = [
 const optimizePackageImports = [
   '@growi/core',
   '@growi/core',
   '@growi/editor',
   '@growi/editor',
+  '@growi/markdown-splitter',
   '@growi/pluginkit',
   '@growi/pluginkit',
   '@growi/presentation',
   '@growi/presentation',
   '@growi/preset-themes',
   '@growi/preset-themes',

+ 1 - 0
apps/app/package.json

@@ -222,6 +222,7 @@
     "@growi/core-styles": "link:../../packages/core-styles",
     "@growi/core-styles": "link:../../packages/core-styles",
     "@growi/custom-icons": "link:../../packages/custom-icons",
     "@growi/custom-icons": "link:../../packages/custom-icons",
     "@growi/editor": "link:../../packages/editor",
     "@growi/editor": "link:../../packages/editor",
+    "@growi/markdown-splitter": "link:../../packages/markdown-splitter",
     "@growi/ui": "link:../../packages/ui",
     "@growi/ui": "link:../../packages/ui",
     "@handsontable/react": "=2.1.0",
     "@handsontable/react": "=2.1.0",
     "@next/bundle-analyzer": "^14.1.3",
     "@next/bundle-analyzer": "^14.1.3",

+ 2 - 0
packages/markdown-splitter/.eslintignore

@@ -0,0 +1,2 @@
+/dist/**
+/types/**

+ 5 - 0
packages/markdown-splitter/.eslintrc.cjs

@@ -0,0 +1,5 @@
+module.exports = {
+  extends: [
+    'weseek/react',
+  ],
+};

+ 1 - 0
packages/markdown-splitter/.gitignore

@@ -0,0 +1 @@
+/dist

+ 43 - 0
packages/markdown-splitter/package.json

@@ -0,0 +1,43 @@
+{
+  "name": "@growi/markdown-splitter",
+  "version": "1.0.0",
+  "license": "MIT",
+  "private": "true",
+  "type": "module",
+  "module": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "files": [
+    "dist"
+  ],
+  "main": "dist/index.cjs",
+  "exports": {
+    ".": {
+      "import": "./dist/index.js",
+      "require": "./dist/index.cjs"
+    }
+  },
+  "scripts": {
+    "build": "vite build",
+    "clean": "shx rm -rf dist",
+    "dev": "vite build --mode dev",
+    "watch": "yarn dev -w --emptyOutDir=false",
+    "lint:js": "yarn eslint **/*.{js,ts}",
+    "lint:typecheck": "tsc",
+    "lint": "npm-run-all -p lint:*",
+    "test": "vitest run --coverage"
+  },
+  "devDependencies": {
+    "eslint-plugin-regex": "^1.8.0",
+    "hast-util-sanitize": "^4.1.0",
+    "pako": "^2.1.0",
+    "throttle-debounce": "^5.0.0",
+    "unified": "^10.1.2",
+    "unist-util-visit": "^4.0.0"
+  },
+  "peerDependencies": {
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0"
+  },
+  "dependencies": {
+  }
+}

+ 1 - 0
packages/markdown-splitter/src/index.ts

@@ -0,0 +1 @@
+export * from './services/markdown-splitter';

+ 106 - 0
packages/markdown-splitter/src/services/markdown-splitter.ts

@@ -0,0 +1,106 @@
+export type Chunk = { // One labeled piece of a split Markdown document
+  label: string; // Section label, e.g. '0-content', '1-2-heading' or '1-2-content'
+  text: string; // The raw heading line, or the content lines joined with '\n'
+};
+
+/**
+ * Flushes the buffered content lines into one chunk and empties the buffer.
+ *
+ * The lines are joined with '\n' and trailing whitespace is removed. A chunk is
+ * pushed only when the resulting text is not empty, so a buffer holding nothing
+ * but blank lines produces no chunk. The buffer is emptied in either case.
+ *
+ * Both array arguments are mutated in place: `chunks` may gain one element and
+ * `contentLines` always ends up with length 0.
+ * @param chunks - The array to store chunks.
+ * @param contentLines - The array of content lines.
+ * @param label - The label for the content chunk.
+ */
+function processPendingContent(chunks: Chunk[], contentLines: string[], label: string) {
+  const text = contentLines.join('\n').trimEnd();
+  if (text !== '') {
+    chunks.push({ label, text });
+  }
+  contentLines.length = 0; // Truncate in place so the caller's array reference stays valid
+}
+
+/**
+ * Updates the section counters for a heading of the given depth and returns the
+ * resulting label, i.e. the counters joined with '-'.
+ *
+ * `sectionNumbers` is mutated in place; it holds one counter per heading level
+ * of the section currently being read.
+ *
+ * - Deeper than the current level: every missing level is appended with the
+ *   value 1, so skipped levels are filled in ([1] with depth 3 gives '1-1-1').
+ * - Same level: the last counter is incremented ([1, 1] with depth 2 gives '1-2').
+ * - Shallower: the deeper counters are removed and the counter at `depth` is
+ *   incremented ([1, 1, 1] with depth 2 gives '1-2'; [1, 1] with depth 1 gives '2').
+ *
+ * The only caller visible here passes the length of a `#+` regex match, which is
+ * always at least 1; a depth of 0 is not handled.
+ * @param sectionNumbers - The current section numbers.
+ * @param depth - The depth of the heading (e.g., # is depth 1).
+ * @returns The updated section label.
+ */
+function updateSectionNumbers(sectionNumbers: number[], depth: number): string {
+  if (depth > sectionNumbers.length) {
+    // If depth increases, initialize missing levels with 1
+    while (sectionNumbers.length < depth) {
+      sectionNumbers.push(1);
+    }
+  }
+  else if (depth === sectionNumbers.length) {
+    // Same level, increment the last number
+    sectionNumbers[depth - 1]++;
+  }
+  else {
+    // Depth decreases, remove deeper levels and increment current level
+    sectionNumbers.splice(depth);
+    sectionNumbers[depth - 1]++;
+  }
+  return sectionNumbers.join('-');
+}
+
+/**
+ * Splits Markdown text into labeled chunks: one chunk per heading line and one
+ * chunk per run of content between headings.
+ *
+ * Labels are built from the section counters joined with '-' plus a suffix:
+ * '<section>-heading' for a heading line and '<section>-content' for the content
+ * that follows it. Content that appears before the first heading is labeled
+ * '0-content'. Non-consecutive heading levels are accepted.
+ *
+ * Content handling:
+ * - leading spaces of content lines (e.g. list indentation) are preserved;
+ * - consecutive blank lines are collapsed into one, and blank lines at the start
+ *   or end of a content run are dropped;
+ * - a line is a heading only when it is one or more '#' followed by whitespace
+ *   and it is outside a fenced code block; any other line starting with '#'
+ *   (e.g. '#hashtag', or a '# comment' inside a code fence) is kept as content
+ *   instead of being discarded;
+ * - both LF and CRLF line endings are accepted.
+ * @param markdown - The input Markdown string.
+ * @returns An array of labeled chunks; empty when the input is not a string or is blank.
+ */
+export function splitMarkdownIntoChunks(markdown: string): Chunk[] {
+  const chunks: Chunk[] = [];
+  const sectionNumbers: number[] = [];
+
+  if (typeof markdown !== 'string' || markdown.trim() === '') {
+    return chunks;
+  }
+
+  // Split on LF or CRLF so that no stray '\r' ends up in heading or content text
+  const lines = markdown.split(/\r?\n/);
+  const contentLines: string[] = [];
+  let currentLabel = '';
+  let previousLineEmpty = false;
+  // Marker that opened the fenced code block being read, or null outside a fence
+  let openFence: string | null = null;
+
+  // Emits the buffered content (if any) under the label of the current section
+  const flushPendingContent = () => {
+    if (contentLines.length > 0) {
+      const contentLabel = currentLabel !== '' ? `${currentLabel}-content` : '0-content';
+      processPendingContent(chunks, contentLines, contentLabel);
+    }
+  };
+
+  for (const line of lines) {
+    const trimmedLine = line.trim();
+
+    // Track fenced code blocks (``` or ~~~) so that '#' lines inside them are not
+    // mistaken for headings. The fence lines themselves are ordinary content.
+    const wasInsideFence = openFence != null;
+    const fenceMatch = trimmedLine.match(/^(`{3,}|~{3,})/);
+    if (fenceMatch) {
+      const marker = fenceMatch[1];
+      if (openFence == null) {
+        // A backtick fence cannot have backticks after the marker; such a line is inline code
+        if (marker[0] === '~' || !trimmedLine.slice(marker.length).includes('`')) {
+          openFence = marker;
+        }
+      }
+      else if (marker[0] === openFence[0] && marker.length >= openFence.length && trimmedLine === marker) {
+        // Closing fence: same character, at least as long, nothing else on the line
+        openFence = null;
+      }
+    }
+
+    // Match heading level and text; never inside a fenced code block
+    const headerMatch = wasInsideFence ? null : trimmedLine.match(/^(#+)\s+(.*)/);
+
+    if (headerMatch) {
+      // Process any pending content before starting a new section
+      flushPendingContent();
+
+      const headingDepth = headerMatch[1].length;
+      currentLabel = updateSectionNumbers(sectionNumbers, headingDepth);
+      chunks.push({ label: `${currentLabel}-heading`, text: line });
+    }
+    else if (trimmedLine === '') {
+      // Handle empty lines to avoid multiple consecutive empty lines
+      if (!previousLineEmpty && contentLines.length > 0) {
+        contentLines.push('');
+        previousLineEmpty = true;
+      }
+    }
+    else {
+      // Add non-empty lines to the current content, including '#'-prefixed
+      // lines that are not headings
+      contentLines.push(line);
+      previousLineEmpty = false;
+    }
+  }
+
+  // Process any remaining content after the last line
+  flushPendingContent();
+
+  return chunks;
+}

+ 252 - 0
packages/markdown-splitter/test/index.spec.ts

@@ -0,0 +1,252 @@
+import type { Chunk } from '../src/services/markdown-splitter';
+import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
+
+describe('splitMarkdownIntoChunks', () => {
+
+  test('handles empty markdown string', () => {
+    const markdown = '';
+    const expected: Chunk[] = [];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles markdown with only content and no headers', () => {
+    const markdown = `This is some content without any headers.
+It spans multiple lines.
+
+Another paragraph.
+    `;
+    const expected: Chunk[] = [
+      {
+        label: '0-content',
+        text: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
+      },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles markdown starting with a header', () => {
+    const markdown = `
+# Header 1
+Content under header 1.
+
+## Header 1.1
+Content under header 1.1.
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Content under header 1.' },
+      { label: '1-1-heading', text: '## Header 1.1' },
+      { label: '1-1-content', text: 'Content under header 1.1.' },
+      { label: '2-heading', text: '# Header 2' },
+      { label: '2-content', text: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles markdown with non-consecutive heading levels', () => {
+    const markdown = `
+Introduction without a header.
+
+# Chapter 1
+Content of chapter 1.
+
+### Section 1.1.1
+Content of section 1.1.1.
+
+## Section 1.2
+Content of section 1.2.
+
+# Chapter 2
+Content of chapter 2.
+
+## Section 2.1
+Content of section 2.1.
+    `;
+    const expected: Chunk[] = [
+      {
+        label: '0-content',
+        text: 'Introduction without a header.',
+      },
+      {
+        label: '1-heading',
+        text: '# Chapter 1',
+      },
+      {
+        label: '1-content',
+        text: 'Content of chapter 1.',
+      },
+      {
+        label: '1-1-1-heading',
+        text: '### Section 1.1.1',
+      },
+      {
+        label: '1-1-1-content',
+        text: 'Content of section 1.1.1.',
+      },
+      {
+        label: '1-2-heading',
+        text: '## Section 1.2',
+      },
+      {
+        label: '1-2-content',
+        text: 'Content of section 1.2.',
+      },
+      {
+        label: '2-heading',
+        text: '# Chapter 2',
+      },
+      {
+        label: '2-content',
+        text: 'Content of chapter 2.',
+      },
+      {
+        label: '2-1-heading',
+        text: '## Section 2.1',
+      },
+      {
+        label: '2-1-content',
+        text: 'Content of section 2.1.',
+      },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles markdown with skipped heading levels', () => {
+    const markdown = `
+# Header 1
+Content under header 1.
+
+#### Header 1.1.1.1
+Content under header 1.1.1.1.
+
+## Header 1.2
+Content under header 1.2.
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Content under header 1.' },
+      { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
+      { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
+      { label: '1-2-heading', text: '## Header 1.2' },
+      { label: '1-2-content', text: 'Content under header 1.2.' },
+      { label: '2-heading', text: '# Header 2' },
+      { label: '2-content', text: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles malformed headings', () => { // NOTE(review): the input below has only well-formed headings (a skipped level); no malformed heading such as '#NoSpace' is exercised
+    const markdown = `
+# Header 1
+Content under header 1.
+
+#### Header 1.1.1.1
+Content under header 1.1.1.1.
+    `;
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Content under header 1.' },
+      { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' }, // levels 2 and 3 are filled in with 1
+      { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles multiple content blocks before any headers', () => {
+    const markdown = `
+This is the first paragraph without a header.
+
+This is the second paragraph without a header.
+
+# Header 1
+Content under header 1.
+    `;
+    const expected: Chunk[] = [
+      {
+        label: '0-content',
+        text: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.',
+      },
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Content under header 1.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles markdown with only headers and no content', () => {
+    const markdown = `
+# Header 1
+
+## Header 1.1
+
+### Header 1.1.1
+    `;
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-1-heading', text: '## Header 1.1' },
+      { label: '1-1-1-heading', text: '### Header 1.1.1' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('handles markdown with mixed content and headers', () => {
+    const markdown = `
+# Header 1
+Content under header 1.
+
+## Header 1.1
+Content under header 1.1.
+Another piece of content.
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Content under header 1.' },
+      { label: '1-1-heading', text: '## Header 1.1' },
+      { label: '1-1-content', text: 'Content under header 1.1.\nAnother piece of content.' },
+      { label: '2-heading', text: '# Header 2' },
+      { label: '2-content', text: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+  test('preserves list indentation and reduces unnecessary line breaks', () => {
+    const markdown = `
+# Header 1
+Content under header 1.
+
+- Item 1
+  - Subitem 1
+- Item 2
+
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Content under header 1.\n\n- Item 1\n  - Subitem 1\n- Item 2' },
+      { label: '2-heading', text: '# Header 2' },
+      { label: '2-content', text: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+
+});

+ 16 - 0
packages/markdown-splitter/tsconfig.json

@@ -0,0 +1,16 @@
+{
+  "$schema": "http://json.schemastore.org/tsconfig",
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "baseUrl": ".",
+    "paths": {
+      "~/*": ["./src/*"]
+    },
+    "types": [
+      "vitest/globals"
+    ]
+  },
+  "include": [
+    "src", "test"
+  ]
+}

+ 39 - 0
packages/markdown-splitter/vite.config.ts

@@ -0,0 +1,39 @@
+import path from 'path';
+
+import glob from 'glob';
+import { nodeExternals } from 'rollup-plugin-node-externals';
+import { defineConfig } from 'vite';
+import dts from 'vite-plugin-dts';
+
+// https://vitejs.dev/config/
+export default defineConfig({ // Library build configuration for this package
+  plugins: [
+    dts({ // vite-plugin-dts; presumably emits the .d.ts files referenced by package.json "types" — confirm
+      copyDtsFiles: true,
+    }),
+    {
+      ...nodeExternals({ // rollup-plugin-node-externals, spread so that `enforce` can be added below
+        devDeps: true, // presumably also treats devDependencies as external — confirm against the plugin docs
+        builtinsPrefix: 'ignore',
+      }),
+      enforce: 'pre', // run this plugin before Vite's core plugins
+    },
+  ],
+  build: {
+    outDir: 'dist',
+    sourcemap: true,
+    lib: {
+      entry: glob.sync(path.resolve(__dirname, 'src/**/*.ts'), { // every .ts file under src is an entry ...
+        ignore: '**/*.spec.ts', // ... except spec files
+      }),
+      name: 'core-libs', // NOTE(review): looks carried over from another package; likely unused for the 'es'/'cjs' formats below — confirm
+      formats: ['es', 'cjs'], // matches the "import" and "require" entries in package.json "exports"
+    },
+    rollupOptions: {
+      output: {
+        preserveModules: true, // keep one output file per source module instead of a single bundle
+        preserveModulesRoot: 'src', // strip the leading 'src/' from output paths
+      },
+    },
+  },
+});

+ 25 - 0
packages/markdown-splitter/vitest.config.ts

@@ -0,0 +1,25 @@
+import tsconfigPaths from 'vite-tsconfig-paths';
+import { defineConfig, coverageConfigDefaults } from 'vitest/config';
+
+export default defineConfig({ // Test-runner configuration for this package
+  plugins: [
+    tsconfigPaths(), // vite-tsconfig-paths; presumably resolves the '~/*' alias declared in tsconfig.json — confirm
+  ],
+  test: {
+    environment: 'node',
+    clearMocks: true,
+    globals: true, // describe/test/expect are used without imports in the spec file
+    coverage: {
+      exclude: [
+        ...coverageConfigDefaults.exclude, // keep the default exclusions ...
+        'src/**/index.ts', // ... and also skip index files under src
+      ],
+      thresholds: { // full coverage is required for every metric
+        statements: 100,
+        branches: 100,
+        lines: 100,
+        functions: 100,
+      },
+    },
+  },
+});

+ 3 - 0
yarn.lock

@@ -2157,6 +2157,9 @@
     react "^18.2.0"
     react "^18.2.0"
     react-dom "^18.2.0"
     react-dom "^18.2.0"
 
 
+"@growi/markdown-splitter@link:packages/markdown-splitter":
+  version "1.0.0"
+
 "@growi/pluginkit@link:packages/pluginkit":
 "@growi/pluginkit@link:packages/pluginkit":
   version "1.0.1"
   version "1.0.1"
   dependencies:
   dependencies: