|
1 | 1 | import { beforeAll, describe, expect, test } from 'vitest' |
2 | 2 | import { z } from 'zod' |
3 | | -import { generateCollectionInsert, defineCollection, resolveCollection, getTableName, SLICE_SIZE, MAX_SQL_QUERY_SIZE } from '../../src/utils/collection' |
| 3 | +import { generateCollectionInsert, defineCollection, resolveCollection, getTableName, SLICE_SIZE, MAX_SQL_QUERY_SIZE, utf8ByteLength } from '../../src/utils/collection' |
4 | 4 | import { initiateValidatorsContext } from '../../src/utils/dependencies' |
5 | 5 |
|
6 | 6 | describe('generateCollectionInsert', () => { |
@@ -123,4 +123,97 @@ describe('generateCollectionInsert', () => { |
123 | 123 | index++ |
124 | 124 | } |
125 | 125 | }) |
| 126 | + |
// Checks that a value made only of multibyte characters is split into several
// queries by UTF-8 byte size, and that the pieces concatenate back to the input.
test('Split multibyte (UTF-8) values that exceed byte limit', () => {
  const collection = resolveCollection('content', defineCollection({
    type: 'data',
    source: '**',
    schema: z.object({
      content: z.string(),
    }),
  }))!

  // '心' is 3 bytes in UTF-8. 35000 chars = 105000 bytes > MAX_SQL_QUERY_SIZE (100000)
  const content = '心'.repeat(35000)

  const { queries: sql } = generateCollectionInsert(collection, {
    id: 'multibyte.md',
    stem: 'multibyte',
    extension: 'md',
    meta: {},
    content,
  })

  // Must be split into multiple queries
  expect(sql.length).toBeGreaterThan(1)

  // Each query must fit within the byte limit (measured in bytes, not UTF-16 units)
  for (const query of sql) {
    expect(utf8ByteLength(query)).toBeLessThan(MAX_SQL_QUERY_SIZE)
  }

  // First query should be INSERT, subsequent should be UPDATE
  expect(sql[0]).toContain('INSERT INTO')
  for (let i = 1; i < sql.length; i++) {
    expect(sql[i]).toContain('UPDATE')
  }

  // Reconstruct the content from all queries and verify it matches the original.
  // Each match is asserted before it is dereferenced so that a missing pattern
  // is reported as an assertion failure rather than a TypeError on `null`.
  const insertMatch = sql[0]!.match(/'(心+)'/)
  expect(insertMatch).not.toBeNull()
  let reconstructed = insertMatch![1]!
  for (let i = 1; i < sql.length; i++) {
    const updateMatch = sql[i]!.match(/CONCAT\(content, '(心+)'\)/)
    expect(updateMatch).not.toBeNull()
    reconstructed += updateMatch![1]!
  }
  expect(reconstructed).toBe(content)
})
| 170 | + |
// Checks that splitting still yields whole characters when the slice boundary,
// counted in bytes, would otherwise land inside a 4-byte emoji.
test('Succeed when SLICE_SIZE byte boundary falls on an emoji', () => {
  const schema = z.object({
    content: z.string(),
  })
  const collection = resolveCollection('content', defineCollection({
    type: 'data',
    source: '**',
    schema,
  }))!

  // The leading 'a' (1 byte) offsets the emoji run so that the SLICE_SIZE byte
  // boundary lands inside a '😀' (4 bytes in UTF-8) instead of between two.
  // biggestColumn = "'a😀😀...😀'" → byte 0: quote(1), byte 1: 'a'(1), bytes 2+: emojis(4 each)
  // Byte at SLICE_SIZE (70000) = 2 + 4*17499.5 → inside the 17500th emoji
  const content = 'a' + '😀'.repeat(25000)

  const { queries } = generateCollectionInsert(collection, {
    id: 'emoji-boundary.md',
    stem: 'emoji-boundary',
    extension: 'md',
    meta: {},
    content,
  })
  const [insertQuery, ...updateQueries] = queries

  // The value is too large for one statement, so several are expected
  expect(queries.length).toBeGreaterThan(1)

  // No statement may reach the byte limit
  for (const statement of queries) {
    expect(utf8ByteLength(statement)).toBeLessThan(MAX_SQL_QUERY_SIZE)
  }

  // An INSERT comes first; every following statement is an UPDATE
  expect(insertQuery).toContain('INSERT INTO')
  for (const statement of updateQueries) {
    expect(statement).toContain('UPDATE')
  }

  // Collect the quoted chunk of every statement; the patterns only accept
  // whole emojis, so a split emoji makes a match come back null.
  const head = insertQuery!.match(/'(a(?:😀)+)'/)
  expect(head).not.toBeNull()
  const pieces = [head![1]!]
  for (const statement of updateQueries) {
    const tail = statement.match(/CONCAT\(content, '((?:😀)+)'\)/)
    expect(tail).not.toBeNull()
    pieces.push(tail![1]!)
  }
  expect(pieces.join('')).toBe(content)
})
126 | 219 | }) |
0 commit comments