2
0

index.spec.ts 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. // splitMarkdownIntoChunks.test.ts
  2. import type { Chunk } from '../src/services/markdown-splitter';
  3. import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
  4. describe('splitMarkdownIntoChunks', () => {
  // Empty input: the splitter is expected to return an empty chunk list.
  test('handles empty markdown string', () => {
    const noChunks: Chunk[] = [];

    expect(splitMarkdownIntoChunks('')).toEqual(noChunks);
  });
  // Header-less input: the expectation puts all text into a single
  // '0-content' chunk, keeps the single newline inside the first paragraph,
  // keeps the blank line between the two paragraphs and drops the trailing
  // newline.
  test('handles markdown with only content and no headers', () => {
    // Built line by line so the blank separator line cannot be silently lost
    // by whitespace-stripping tools, which a template literal is vulnerable to.
    const markdown = [
      'This is some content without any headers.',
      'It spans multiple lines.',
      '', // paragraph break: the expected content contains '\n\n' at this point
      'Another paragraph.',
      '', // trailing newline at the end of the document
    ].join('\n');
    const expected: Chunk[] = [
      {
        label: '0-content',
        content: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
      },
    ];

    const result = splitMarkdownIntoChunks(markdown);

    expect(result).toEqual(expected);
  });
  // Document that opens with a header: each header line is expected as its own
  // chunk, followed by a '<label>-content' chunk holding the text beneath it.
  test('handles markdown starting with a header', () => {
    // The empty first/last entries reproduce the leading and trailing newline
    // of the document.
    const source = [
      '',
      '# Header 1',
      'Content under header 1.',
      '## Header 1.1',
      'Content under header 1.1.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const expectedChunks: Chunk[] = [
      {
        label: '1',
        content: '# Header 1',
      },
      {
        label: '1-content',
        content: 'Content under header 1.',
      },
      {
        label: '1-1',
        content: '## Header 1.1',
      },
      {
        label: '1-1-content',
        content: 'Content under header 1.1.',
      },
      {
        label: '2',
        content: '# Header 2',
      },
      {
        label: '2-content',
        content: 'Content under header 2.',
      },
    ];

    expect(splitMarkdownIntoChunks(source)).toEqual(expectedChunks);
  });
  // Heading levels that jump (h1 -> h3 -> h2) with an unheaded introduction.
  // Per the expectation: the introduction is labelled '0-content', the h3
  // directly under chapter 1 is labelled '1-1-1', and the h2 that follows it
  // is labelled '1-2'.
  test('handles markdown with non-consecutive heading levels', () => {
    // The empty first/last entries reproduce the leading and trailing newline
    // of the document.
    const source = [
      '',
      'Introduction without a header.',
      '# Chapter 1',
      'Content of chapter 1.',
      '### Section 1.1.1',
      'Content of section 1.1.1.',
      '## Section 1.2',
      'Content of section 1.2.',
      '# Chapter 2',
      'Content of chapter 2.',
      '## Section 2.1',
      'Content of section 2.1.',
      '',
    ].join('\n');

    const expectedChunks: Chunk[] = [
      { label: '0-content', content: 'Introduction without a header.' },
      { label: '1', content: '# Chapter 1' },
      { label: '1-content', content: 'Content of chapter 1.' },
      { label: '1-1-1', content: '### Section 1.1.1' },
      { label: '1-1-1-content', content: 'Content of section 1.1.1.' },
      { label: '1-2', content: '## Section 1.2' },
      { label: '1-2-content', content: 'Content of section 1.2.' },
      { label: '2', content: '# Chapter 2' },
      { label: '2-content', content: 'Content of chapter 2.' },
      { label: '2-1', content: '## Section 2.1' },
      { label: '2-1-content', content: 'Content of section 2.1.' },
    ];

    expect(splitMarkdownIntoChunks(source)).toEqual(expectedChunks);
  });
  // An h4 placed directly under an h1: the expectation labels it '1-1-1-1',
  // and the h2 that comes afterwards is labelled '1-2'.
  test('handles markdown with skipped heading levels', () => {
    // The empty first/last entries reproduce the leading and trailing newline
    // of the document.
    const source = [
      '',
      '# Header 1',
      'Content under header 1.',
      '#### Header 1.1.1.1',
      'Content under header 1.1.1.1.',
      '## Header 1.2',
      'Content under header 1.2.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const expectedChunks: Chunk[] = [
      {
        label: '1',
        content: '# Header 1',
      },
      {
        label: '1-content',
        content: 'Content under header 1.',
      },
      {
        label: '1-1-1-1',
        content: '#### Header 1.1.1.1',
      },
      {
        label: '1-1-1-1-content',
        content: 'Content under header 1.1.1.1.',
      },
      {
        label: '1-2',
        content: '## Header 1.2',
      },
      {
        label: '1-2-content',
        content: 'Content under header 1.2.',
      },
      {
        label: '2',
        content: '# Header 2',
      },
      {
        label: '2-content',
        content: 'Content under header 2.',
      },
    ];

    expect(splitMarkdownIntoChunks(source)).toEqual(expectedChunks);
  });
  // NOTE(review): despite its title, the input below contains no malformed
  // heading (for example a bare '### ' with no text). As written it repeats
  // the first half of the skipped-heading-levels scenario. Confirm whether a
  // malformed heading line was meant to sit between the two sections, and
  // whether it should be skipped or treated as content, before extending it.
  test('handles malformed headings', () => {
    // The empty first/last entries reproduce the leading and trailing newline
    // of the document.
    const source = [
      '',
      '# Header 1',
      'Content under header 1.',
      '#### Header 1.1.1.1',
      'Content under header 1.1.1.1.',
      '',
    ].join('\n');

    const expectedChunks: Chunk[] = [
      {
        label: '1',
        content: '# Header 1',
      },
      {
        label: '1-content',
        content: 'Content under header 1.',
      },
      {
        label: '1-1-1-1',
        content: '#### Header 1.1.1.1',
      },
      {
        label: '1-1-1-1-content',
        content: 'Content under header 1.1.1.1.',
      },
    ];

    expect(splitMarkdownIntoChunks(source)).toEqual(expectedChunks);
  });
  // Two paragraphs ahead of the first header: the expectation merges them into
  // one '0-content' chunk and keeps the blank line ('\n\n') that separates
  // them; the header and its content follow as separate chunks.
  test('handles multiple content blocks before any headers', () => {
    // Built line by line so the blank separator line cannot be silently lost
    // by whitespace-stripping tools, which a template literal is vulnerable to.
    const markdown = [
      '', // leading newline at the start of the document
      'This is the first paragraph without a header.',
      '', // paragraph break: the expected content contains '\n\n' at this point
      'This is the second paragraph without a header.',
      '# Header 1',
      'Content under header 1.',
      '', // trailing newline at the end of the document
    ].join('\n');
    const expected: Chunk[] = [
      {
        label: '0-content',
        content: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.',
      },
      { label: '1', content: '# Header 1' },
      { label: '1-content', content: 'Content under header 1.' },
    ];

    const result = splitMarkdownIntoChunks(markdown);

    expect(result).toEqual(expected);
  });
  // Headers with no text beneath them: the expectation contains one chunk per
  // header and no '-content' chunks.
  test('handles markdown with only headers and no content', () => {
    // The empty first/last entries reproduce the leading and trailing newline
    // of the document.
    const source = ['', '# Header 1', '## Header 1.1', '### Header 1.1.1', ''].join('\n');

    const expectedChunks: Chunk[] = [
      {
        label: '1',
        content: '# Header 1',
      },
      {
        label: '1-1',
        content: '## Header 1.1',
      },
      {
        label: '1-1-1',
        content: '### Header 1.1.1',
      },
    ];

    expect(splitMarkdownIntoChunks(source)).toEqual(expectedChunks);
  });
  // Two consecutive content lines under one header: the expectation keeps them
  // in a single '-content' chunk joined by one newline.
  test('handles markdown with mixed content and headers', () => {
    // The empty first/last entries reproduce the leading and trailing newline
    // of the document.
    const source = [
      '',
      '# Header 1',
      'Content under header 1.',
      '## Header 1.1',
      'Content under header 1.1.',
      'Another piece of content.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const expectedChunks: Chunk[] = [
      {
        label: '1',
        content: '# Header 1',
      },
      {
        label: '1-content',
        content: 'Content under header 1.',
      },
      {
        label: '1-1',
        content: '## Header 1.1',
      },
      {
        label: '1-1-content',
        content: 'Content under header 1.1.\nAnother piece of content.',
      },
      {
        label: '2',
        content: '# Header 2',
      },
      {
        label: '2-content',
        content: 'Content under header 2.',
      },
    ];

    expect(splitMarkdownIntoChunks(source)).toEqual(expectedChunks);
  });
  // A paragraph followed by a nested list under one header: the expectation
  // keeps the blank line between the paragraph and the list and keeps the
  // sub-item's leading space.
  //
  // NOTE(review): the title also promises that unnecessary line breaks are
  // reduced. This input deliberately holds only ONE blank line, because the
  // splitter's collapsing rule is not visible from this file; add a run of
  // blank lines here once that rule is confirmed.
  test('preserves list indentation and reduces unnecessary line breaks', () => {
    // Built line by line so the blank line and the sub-item indentation cannot
    // be silently lost by whitespace-stripping tools, which a template literal
    // is vulnerable to.
    const markdown = [
      '', // leading newline at the start of the document
      '# Header 1',
      'Content under header 1.',
      '', // blank line: the expected content contains '\n\n' before the list
      '- Item 1',
      ' - Subitem 1', // leading space matches the expected ' - Subitem 1'
      '- Item 2',
      '# Header 2',
      'Content under header 2.',
      '', // trailing newline at the end of the document
    ].join('\n');
    const expected: Chunk[] = [
      { label: '1', content: '# Header 1' },
      { label: '1-content', content: 'Content under header 1.\n\n- Item 1\n - Subitem 1\n- Item 2' },
      { label: '2', content: '# Header 2' },
      { label: '2-content', content: 'Content under header 2.' },
    ];

    const result = splitMarkdownIntoChunks(markdown);

    expect(result).toEqual(expected);
  });
  221. });