Просмотр исходного кода

Merge branch 'master' into feat/156430-make-it-clear-whether-the-uploaded-file-is-saved-in-the-vector-store

Shun Miyazawa 1 год назад
Родитель
Commit
3ca7d34eb0

+ 0 - 1
.changeset/config.json

@@ -15,7 +15,6 @@
     "@growi/app",
     "@growi/slackbot-proxy",
     "@growi/custom-icons",
-    "@growi/markdown-splitter",
     "@growi/editor",
     "@growi/presentation",
     "@growi/preset-templates",

+ 0 - 1
apps/app/next.config.js

@@ -73,7 +73,6 @@ const getTranspilePackages = () => {
 const optimizePackageImports = [
   '@growi/core',
   '@growi/editor',
-  '@growi/markdown-splitter',
   '@growi/pluginkit',
   '@growi/presentation',
   '@growi/preset-themes',

+ 6 - 3
apps/app/package.json

@@ -67,6 +67,7 @@
     "@azure/openai": "^2.0.0-beta.2",
     "@azure/storage-blob": "^12.16.0",
     "@browser-bunyan/console-formatted-stream": "^1.8.0",
+    "@cspell/dynamic-import": "^8.15.4",
     "@elastic/elasticsearch7": "npm:@elastic/elasticsearch@^7.17.0",
     "@elastic/elasticsearch8": "npm:@elastic/elasticsearch@^8.7.0",
     "@godaddy/terminus": "^4.9.0",
@@ -128,14 +129,16 @@
     "i18next-resources-to-backend": "^1.2.1",
     "is-absolute-url": "^4.0.1",
     "is-iso-date": "^0.0.1",
+    "js-tiktoken": "^1.0.15",
+    "js-yaml": "^4.1.0",
     "katex": "^0.16.11",
     "ldapjs": "^3.0.2",
     "lucene-query-parser": "^1.2.0",
     "markdown-table": "^3.0.3",
+    "md5": "^2.2.1",
     "mdast-util-from-markdown": "^2.0.1",
     "mdast-util-gfm-table": "^2.0.0",
     "mdast-util-wiki-link": "^0.1.2",
-    "md5": "^2.2.1",
     "mermaid": "^11.2.0",
     "method-override": "^3.0.0",
     "micromark-extension-gfm-table": "^2.1.0",
@@ -202,6 +205,7 @@
     "remark-math": "^6.0.0",
     "remark-parse": "^11.0.0",
     "remark-rehype": "^11.1.1",
+    "remark-stringify": "^11.0.0",
     "remark-toc": "^9.0.0",
     "sanitize-filename": "^1.6.3",
     "socket.io": "^4.7.5",
@@ -211,8 +215,8 @@
     "swagger-jsdoc": "^6.2.8",
     "swr": "^2.2.2",
     "throttle-debounce": "^5.0.0",
-    "uid-safe": "^2.1.5",
     "uglifycss": "^0.0.29",
+    "uid-safe": "^2.1.5",
     "unified": "^11.0.0",
     "unist-util-visit": "^5.0.0",
     "universal-bunyan": "^0.9.2",
@@ -237,7 +241,6 @@
     "@growi/core-styles": "workspace:^",
     "@growi/custom-icons": "workspace:^",
     "@growi/editor": "workspace:^",
-    "@growi/markdown-splitter": "workspace:^",
     "@growi/ui": "workspace:^",
     "@handsontable/react": "=2.1.0",
     "@next/bundle-analyzer": "^14.1.3",

+ 0 - 0
packages/markdown-splitter/src/services/markdown-splitter.spec.ts → apps/app/src/features/openai/server/services/markdown-splitter/markdown-splitter.spec.ts


+ 14 - 7
packages/markdown-splitter/src/services/markdown-splitter.ts → apps/app/src/features/openai/server/services/markdown-splitter/markdown-splitter.ts

@@ -1,12 +1,13 @@
+import { dynamicImport } from '@cspell/dynamic-import';
 import type { TiktokenModel } from 'js-tiktoken';
 import { encodingForModel } from 'js-tiktoken';
 import yaml from 'js-yaml';
-import remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing
-import remarkGfm from 'remark-gfm'; // GFM processing
-import remarkParse from 'remark-parse';
-import type { Options as StringifyOptions } from 'remark-stringify';
-import remarkStringify from 'remark-stringify';
-import { unified } from 'unified';
+import type * as RemarkFrontmatter from 'remark-frontmatter';
+import type * as RemarkGfm from 'remark-gfm';
+import type * as RemarkParse from 'remark-parse';
+import type * as RemarkStringify from 'remark-stringify';
+import type * as Unified from 'unified';
+
 
 export type MarkdownFragment = {
   label: string;
@@ -59,12 +60,18 @@ export async function splitMarkdownIntoFragments(markdownText: string, model: Ti
 
   const encoder = encodingForModel(model);
 
+  const remarkParse = (await dynamicImport<typeof RemarkParse>('remark-parse', __dirname)).default;
+  const remarkFrontmatter = (await dynamicImport<typeof RemarkFrontmatter>('remark-frontmatter', __dirname)).default;
+  const remarkGfm = (await dynamicImport<typeof RemarkGfm>('remark-gfm', __dirname)).default;
+  const remarkStringify = (await dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname)).default;
+  const unified = (await dynamicImport<typeof Unified>('unified', __dirname)).unified;
+
   const parser = unified()
     .use(remarkParse)
     .use(remarkFrontmatter, ['yaml'])
     .use(remarkGfm); // Enable GFM extensions
 
-  const stringifyOptions: StringifyOptions = {
+  const stringifyOptions: RemarkStringify.Options = {
     bullet: '-', // Set list bullet to hyphen
     rule: '-', // Use hyphen for horizontal rules
   };

+ 0 - 0
packages/markdown-splitter/src/services/markdown-token-splitter.spec.ts → apps/app/src/features/openai/server/services/markdown-splitter/markdown-token-splitter.spec.ts


+ 2 - 2
packages/markdown-splitter/src/services/markdown-token-splitter.ts → apps/app/src/features/openai/server/services/markdown-splitter/markdown-token-splitter.ts

@@ -105,7 +105,7 @@ export async function splitMarkdownIntoChunks(
 
   // Split markdown text into chunks
   const markdownFragments = await splitMarkdownIntoFragments(markdownText, model);
-  const chunks = [] as string[];
+  const chunks: string[] = [];
 
   // Group the chunks based on token count
   const fragmentGroupes = groupMarkdownFragments(markdownFragments, maxToken);
@@ -162,7 +162,7 @@ export async function splitMarkdownIntoChunks(
             const charCountForSplit = Math.floor((remainingTokenCount / fragmenTokenCount) * fragmentCharCount);
 
             // Split content based on character count
-            const splitContents = [];
+            const splitContents: string[] = [];
             for (let i = 0; i < fragment.text.length; i += charCountForSplit) {
               splitContents.push(fragment.text.slice(i, i + charCountForSplit));
             }

+ 18 - 9
apps/app/src/features/openai/server/services/openai.ts

@@ -21,6 +21,7 @@ import loggerFactory from '~/utils/logger';
 import { OpenaiServiceTypes } from '../../interfaces/ai';
 
 import { getClient } from './client-delegator';
+import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
 import { oepnaiApiErrorHandler } from './openai-api-error-handler';
 
 const BATCH_SIZE = 100;
@@ -29,6 +30,8 @@ const logger = loggerFactory('growi:service:openai');
 
 let isVectorStoreForPublicScopeExist = false;
 
+type VectorStoreFileRelationsMap = Map<string, VectorStoreFileRelation>
+
 export interface IOpenaiService {
   getOrCreateThread(userId: string, vectorStoreId?: string, threadId?: string): Promise<OpenAI.Beta.Threads.Thread | undefined>;
   getOrCreateVectorStoreForPublicScope(): Promise<VectorStoreDocument>;
@@ -134,26 +137,32 @@ class OpenaiService implements IOpenaiService {
     return newVectorStoreDocument;
   }
 
-  private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
-    const file = await toFile(Readable.from(body), `${pageId}.md`);
-    const uploadedFile = await this.client.uploadFile(file);
-    return uploadedFile;
+  private async uploadFileByChunks(pageId: Types.ObjectId, body: string, vectorStoreFileRelationsMap: VectorStoreFileRelationsMap) {
+    const chunks = await splitMarkdownIntoChunks(body, 'gpt-4o');
+    for await (const [index, chunk] of chunks.entries()) {
+      try {
+        const file = await toFile(Readable.from(chunk), `${pageId}-chunk-${index}.md`);
+        const uploadedFile = await this.client.uploadFile(file);
+        prepareVectorStoreFileRelations(pageId, uploadedFile.id, vectorStoreFileRelationsMap);
+      }
+      catch (err) {
+        logger.error(err);
+      }
+    }
   }
 
   async createVectorStoreFile(pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
-    const vectorStoreFileRelationsMap: Map<string, VectorStoreFileRelation> = new Map();
+    const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
     const processUploadFile = async(page: PageDocument) => {
       if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
-          const uploadedFile = await this.uploadFile(page._id, page.revision.body);
-          prepareVectorStoreFileRelations(page._id, uploadedFile.id, vectorStoreFileRelationsMap);
+          await this.uploadFileByChunks(page._id, page.revision.body, vectorStoreFileRelationsMap);
           return;
         }
 
         const pagePopulatedToShowRevision = await page.populateDataToShowRevision();
         if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
-          const uploadedFile = await this.uploadFile(page._id, pagePopulatedToShowRevision.revision.body);
-          prepareVectorStoreFileRelations(page._id, uploadedFile.id, vectorStoreFileRelationsMap);
+          await this.uploadFileByChunks(page._id, pagePopulatedToShowRevision.revision.body, vectorStoreFileRelationsMap);
         }
       }
     };

+ 0 - 2
packages/markdown-splitter/.eslintignore

@@ -1,2 +0,0 @@
-/dist/**
-/types/**

+ 0 - 5
packages/markdown-splitter/.eslintrc.cjs

@@ -1,5 +0,0 @@
-module.exports = {
-  extends: [
-    'weseek/react',
-  ],
-};

+ 0 - 1
packages/markdown-splitter/.gitignore

@@ -1 +0,0 @@
-/dist

+ 0 - 49
packages/markdown-splitter/package.json

@@ -1,49 +0,0 @@
-{
-  "name": "@growi/markdown-splitter",
-  "version": "1.0.0",
-  "license": "MIT",
-  "private": "true",
-  "type": "module",
-  "module": "dist/index.js",
-  "types": "dist/index.d.ts",
-  "files": [
-    "dist"
-  ],
-  "main": "dist/index.cjs",
-  "exports": {
-    ".": {
-      "import": "./dist/index.js",
-      "require": "./dist/index.cjs"
-    }
-  },
-  "scripts": {
-    "build": "vite build",
-    "clean": "shx rm -rf dist",
-    "dev": "vite build --mode dev",
-    "watch": "pnpm run dev -w --emptyOutDir=false",
-    "lint:js": "eslint **/*.{js,ts}",
-    "lint:typecheck": "tsc",
-    "lint": "npm-run-all -p lint:*",
-    "test": "vitest run --coverage"
-  },
-  "dependencies": {
-    "js-tiktoken": "^1.0.15",
-    "js-yaml": "^4.1.0",
-    "remark-frontmatter": "^5.0.0",
-    "remark-gfm": "^4.0.0",
-    "remark-parse": "^11.0.0",
-    "remark-stringify": "^11.0.0",
-    "unified": "^11.0.0"
-  },
-  "devDependencies": {
-    "@types/js-yaml": "^4.0.9",
-    "eslint-plugin-regex": "^1.8.0",
-    "hast-util-sanitize": "^4.1.0",
-    "pako": "^2.1.0",
-    "throttle-debounce": "^5.0.0"
-  },
-  "peerDependencies": {
-    "react": "^18.2.0",
-    "react-dom": "^18.2.0"
-  }
-}

+ 0 - 2
packages/markdown-splitter/src/index.ts

@@ -1,2 +0,0 @@
-export * from './services/markdown-splitter';
-export * from './services/markdown-token-splitter';

+ 0 - 16
packages/markdown-splitter/tsconfig.json

@@ -1,16 +0,0 @@
-{
-  "$schema": "http://json.schemastore.org/tsconfig",
-  "extends": "../../tsconfig.base.json",
-  "compilerOptions": {
-    "baseUrl": ".",
-    "paths": {
-      "~/*": ["./src/*"]
-    },
-    "types": [
-      "vitest/globals"
-    ]
-  },
-  "include": [
-    "src", "test"
-  ]
-}

+ 0 - 39
packages/markdown-splitter/vite.config.ts

@@ -1,39 +0,0 @@
-import path from 'path';
-
-import glob from 'glob';
-import { nodeExternals } from 'rollup-plugin-node-externals';
-import { defineConfig } from 'vite';
-import dts from 'vite-plugin-dts';
-
-// https://vitejs.dev/config/
-export default defineConfig({
-  plugins: [
-    dts({
-      copyDtsFiles: true,
-    }),
-    {
-      ...nodeExternals({
-        devDeps: true,
-        builtinsPrefix: 'ignore',
-      }),
-      enforce: 'pre',
-    },
-  ],
-  build: {
-    outDir: 'dist',
-    sourcemap: true,
-    lib: {
-      entry: glob.sync(path.resolve(__dirname, 'src/**/*.ts'), {
-        ignore: '**/*.spec.ts',
-      }),
-      name: 'core-libs',
-      formats: ['es', 'cjs'],
-    },
-    rollupOptions: {
-      output: {
-        preserveModules: true,
-        preserveModulesRoot: 'src',
-      },
-    },
-  },
-});

+ 0 - 25
packages/markdown-splitter/vitest.config.ts

@@ -1,25 +0,0 @@
-import tsconfigPaths from 'vite-tsconfig-paths';
-import { defineConfig, coverageConfigDefaults } from 'vitest/config';
-
-export default defineConfig({
-  plugins: [
-    tsconfigPaths(),
-  ],
-  test: {
-    environment: 'node',
-    clearMocks: true,
-    globals: true,
-    coverage: {
-      exclude: [
-        ...coverageConfigDefaults.exclude,
-        'src/**/index.ts',
-      ],
-      thresholds: {
-        statements: 100,
-        branches: 100,
-        lines: 100,
-        functions: 100,
-      },
-    },
-  },
-});

+ 25 - 61
pnpm-lock.yaml

@@ -211,6 +211,9 @@ importers:
       '@browser-bunyan/console-formatted-stream':
         specifier: ^1.8.0
         version: 1.8.0
+      '@cspell/dynamic-import':
+        specifier: ^8.15.4
+        version: 8.15.4
       '@elastic/elasticsearch7':
         specifier: npm:@elastic/elasticsearch@^7.17.0
         version: '@elastic/elasticsearch@7.17.13'
@@ -394,6 +397,12 @@ importers:
       is-iso-date:
         specifier: ^0.0.1
         version: 0.0.1
+      js-tiktoken:
+        specifier: ^1.0.15
+        version: 1.0.15
+      js-yaml:
+        specifier: ^4.1.0
+        version: 4.1.0
       katex:
         specifier: ^0.16.11
         version: 0.16.11
@@ -616,6 +625,9 @@ importers:
       remark-rehype:
         specifier: ^11.1.1
         version: 11.1.1
+      remark-stringify:
+        specifier: ^11.0.0
+        version: 11.0.0
       remark-toc:
         specifier: ^9.0.0
         version: 9.0.0
@@ -698,9 +710,6 @@ importers:
       '@growi/editor':
         specifier: workspace:^
         version: link:../../packages/editor
-      '@growi/markdown-splitter':
-        specifier: workspace:^
-        version: link:../../packages/markdown-splitter
       '@growi/ui':
         specifier: workspace:^
         version: link:../../packages/ui
@@ -1186,52 +1195,6 @@ importers:
         specifier: ^13.6.19
         version: 13.6.19
 
-  packages/markdown-splitter:
-    dependencies:
-      js-tiktoken:
-        specifier: ^1.0.15
-        version: 1.0.15
-      js-yaml:
-        specifier: ^4.1.0
-        version: 4.1.0
-      react:
-        specifier: ^18.2.0
-        version: 18.2.0
-      react-dom:
-        specifier: ^18.2.0
-        version: 18.2.0(react@18.2.0)
-      remark-frontmatter:
-        specifier: ^5.0.0
-        version: 5.0.0
-      remark-gfm:
-        specifier: ^4.0.0
-        version: 4.0.0
-      remark-parse:
-        specifier: ^11.0.0
-        version: 11.0.0
-      remark-stringify:
-        specifier: ^11.0.0
-        version: 11.0.0
-      unified:
-        specifier: ^11.0.0
-        version: 11.0.5
-    devDependencies:
-      '@types/js-yaml':
-        specifier: ^4.0.9
-        version: 4.0.9
-      eslint-plugin-regex:
-        specifier: ^1.8.0
-        version: 1.10.0(eslint@8.41.0)
-      hast-util-sanitize:
-        specifier: ^4.1.0
-        version: 4.1.0
-      pako:
-        specifier: ^2.1.0
-        version: 2.1.0
-      throttle-debounce:
-        specifier: ^5.0.0
-        version: 5.0.2
-
   packages/pluginkit:
     dependencies:
       '@growi/core':
@@ -2467,6 +2430,10 @@ packages:
     resolution: {integrity: sha512-ooWCrlZP11i8GImSjTHYHLkvFDP48nS4+204nGb1RiX/WXYHmJA2III9/e2DWVabCESdW7hBAEzHRqUn9OUVvQ==}
     engines: {node: '>=0.1.90'}
 
+  '@cspell/dynamic-import@8.15.4':
+    resolution: {integrity: sha512-tr0F6EYN6qtniNvt1Uib+PgYQHeo4dQHXE2Optap+hYTOoQ2VoQ+SwBVjZ+Q2bmSAB0fmOyf0AvgsUtnWIpavw==}
+    engines: {node: '>=18.0'}
+
   '@cspotcode/source-map-support@0.8.1':
     resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==}
     engines: {node: '>=12'}
@@ -4045,9 +4012,6 @@ packages:
   '@types/jest@29.5.12':
     resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==}
 
-  '@types/js-yaml@4.0.9':
-    resolution: {integrity: sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==}
-
   '@types/json-schema@7.0.11':
     resolution: {integrity: sha512-wOuvG1SN4Us4rez+tylwwwCV1psiNVOkJeM3AUWUNWg/jDQY2+HE/444y5gc+jBmRqASOm2Oeh5c1axHobwRKQ==}
 
@@ -7263,9 +7227,6 @@ packages:
   hast-util-raw@9.0.4:
     resolution: {integrity: sha512-LHE65TD2YiNsHD3YuXcKPHXPLuYh/gjp12mOfU8jxSrm1f/yJpsb0F/KKljS6U9LJoP0Ux+tCe8iJ2AsPzTdgA==}
 
-  hast-util-sanitize@4.1.0:
-    resolution: {integrity: sha512-Hd9tU0ltknMGRDv+d6Ro/4XKzBqQnP/EZrpiTbpFYfXv/uOhWeKc+2uajcbEvAEH98VZd7eII2PiXm13RihnLw==}
-
   hast-util-sanitize@5.0.1:
     resolution: {integrity: sha512-IGrgWLuip4O2nq5CugXy4GI2V8kx4sFVy5Hd4vF7AR2gxS0N9s7nEAVUyeMtZKZvzrxVsHt73XdTsno1tClIkQ==}
 
@@ -7481,6 +7442,9 @@ packages:
     engines: {node: '>=8'}
     hasBin: true
 
+  import-meta-resolve@4.1.0:
+    resolution: {integrity: sha512-I6fiaX09Xivtk+THaMfAwnA3MVA5Big1WHF1Dfx9hFuvNIWpXnorlkzhcQf6ehrqQiiZECRt1poOAkPmer3ruw==}
+
   imurmurhash@0.1.4:
     resolution: {integrity: sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==}
     engines: {node: '>=0.8.19'}
@@ -14045,6 +14009,10 @@ snapshots:
   '@colors/colors@1.5.0':
     optional: true
 
+  '@cspell/dynamic-import@8.15.4':
+    dependencies:
+      import-meta-resolve: 4.1.0
+
   '@cspotcode/source-map-support@0.8.1':
     dependencies:
       '@jridgewell/trace-mapping': 0.3.9
@@ -16160,8 +16128,6 @@ snapshots:
       expect: 29.7.0
       pretty-format: 29.7.0
 
-  '@types/js-yaml@4.0.9': {}
-
   '@types/json-schema@7.0.11': {}
 
   '@types/json-schema@7.0.6': {}
@@ -19928,10 +19894,6 @@ snapshots:
       web-namespaces: 2.0.1
       zwitch: 2.0.2
 
-  hast-util-sanitize@4.1.0:
-    dependencies:
-      '@types/hast': 2.3.4
-
   hast-util-sanitize@5.0.1:
     dependencies:
       '@types/hast': 3.0.4
@@ -20224,6 +20186,8 @@ snapshots:
       pkg-dir: 4.2.0
       resolve-cwd: 3.0.0
 
+  import-meta-resolve@4.1.0: {}
+
   imurmurhash@0.1.4: {}
 
   indent-string@2.1.0: