Skip to content

Commit 9f8402a

Browse files
fix(collection): use UTF-8 byte length for SQL query size check (#3717)
1 parent 5b75ffd commit 9f8402a

File tree

2 files changed

+118
-5
lines changed

2 files changed

+118
-5
lines changed

src/utils/collection.ts

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,28 @@ export const MAX_SQL_QUERY_SIZE = 100000
119119
/**
120120
* When we split a value in multiple SQL queries, we want to allow for a buffer
121121
* so if the rest of the query is a bit long, we will not hit the 100KB limit
122+
* NOTE: This is the byte limit, not the character limit
122123
*/
123124
export const SLICE_SIZE = 70000
124125

126+
const encoder = new TextEncoder()
127+
128+
/**
129+
* Calculate UTF-8 byte length of a string
130+
*/
131+
export function utf8ByteLength(str: string): number {
132+
return encoder.encode(str).byteLength
133+
}
134+
135+
/**
136+
* Return the largest character index such that str.slice(0, result) has UTF-8 byte length <= targetBytes
137+
*/
138+
export function charIndexAtByteOffset(str: string, targetBytes: number): number {
139+
const buf = new Uint8Array(targetBytes)
140+
const { read } = encoder.encodeInto(str, buf)
141+
return read
142+
}
143+
125144
// Convert collection data to SQL insert statement
126145
export function generateCollectionInsert(collection: ResolvedCollection, data: ParsedContentFile): { queries: string[], hash: string } {
127146
const fields: string[] = []
@@ -176,7 +195,7 @@ export function generateCollectionInsert(collection: ResolvedCollection, data: P
176195
const sql = `INSERT INTO ${collection.tableName} VALUES (${'?, '.repeat(values.length).slice(0, -2)});`
177196
.replace(/\?/g, () => values[index++] as string)
178197

179-
if (sql.length < MAX_SQL_QUERY_SIZE) {
198+
if (utf8ByteLength(sql) < MAX_SQL_QUERY_SIZE) {
180199
return {
181200
queries: [sql],
182201
hash: valuesHash,
@@ -186,7 +205,7 @@ export function generateCollectionInsert(collection: ResolvedCollection, data: P
186205
// Split the SQL into multiple statements:
187206
// Take the biggest column to insert (usually body) and split the column in multiple strings
188207
// first we insert the row in the database, then we update it with the rest of the string by concatenation
189-
const biggestColumn = [...values].sort((a, b) => String(b).length - String(a).length)[0]
208+
const biggestColumn = [...values].sort((a, b) => utf8ByteLength(String(b)) - utf8ByteLength(String(a)))[0]
190209
const bigColumnIndex = values.indexOf(biggestColumn!)
191210
const bigColumnName = fields[bigColumnIndex]
192211

@@ -199,7 +218,7 @@ export function generateCollectionInsert(collection: ResolvedCollection, data: P
199218
}
200219

201220
if (typeof biggestColumn === 'string') {
202-
let sliceIndex = getSliceIndex(biggestColumn, SLICE_SIZE)
221+
let sliceIndex = getSliceIndex(biggestColumn, charIndexAtByteOffset(biggestColumn, SLICE_SIZE))
203222

204223
values[bigColumnIndex] = `${biggestColumn.slice(0, sliceIndex)}'`
205224
index = 0
@@ -211,7 +230,8 @@ export function generateCollectionInsert(collection: ResolvedCollection, data: P
211230
]
212231
while (sliceIndex < biggestColumn.length) {
213232
const prevSliceIndex = sliceIndex
214-
sliceIndex = getSliceIndex(biggestColumn, sliceIndex + SLICE_SIZE)
233+
const rawIndex = charIndexAtByteOffset(biggestColumn.slice(sliceIndex), SLICE_SIZE) + sliceIndex
234+
sliceIndex = rawIndex >= biggestColumn.length ? biggestColumn.length + 1 : getSliceIndex(biggestColumn, rawIndex)
215235

216236
const isLastSlice = sliceIndex > biggestColumn.length
217237
const newSlice = `'${biggestColumn.slice(prevSliceIndex, sliceIndex)}` + (!isLastSlice ? '\'' : '')

test/unit/generateCollectionInsert.test.ts

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { beforeAll, describe, expect, test } from 'vitest'
22
import { z } from 'zod'
3-
import { generateCollectionInsert, defineCollection, resolveCollection, getTableName, SLICE_SIZE, MAX_SQL_QUERY_SIZE } from '../../src/utils/collection'
3+
import { generateCollectionInsert, defineCollection, resolveCollection, getTableName, SLICE_SIZE, MAX_SQL_QUERY_SIZE, utf8ByteLength } from '../../src/utils/collection'
44
import { initiateValidatorsContext } from '../../src/utils/dependencies'
55

66
describe('generateCollectionInsert', () => {
@@ -123,4 +123,97 @@ describe('generateCollectionInsert', () => {
123123
index++
124124
}
125125
})
126+
127+
test('Split multibyte (UTF-8) values that exceed byte limit', () => {
128+
const collection = resolveCollection('content', defineCollection({
129+
type: 'data',
130+
source: '**',
131+
schema: z.object({
132+
content: z.string(),
133+
}),
134+
}))!
135+
136+
// '心' is 3 bytes in UTF-8. 35000 chars = 105000 bytes > MAX_SQL_QUERY_SIZE (100000)
137+
const content = '心'.repeat(35000)
138+
139+
const { queries: sql } = generateCollectionInsert(collection, {
140+
id: 'multibyte.md',
141+
stem: 'multibyte',
142+
extension: 'md',
143+
meta: {},
144+
content,
145+
})
146+
147+
// Must be split into multiple queries
148+
expect(sql.length).toBeGreaterThan(1)
149+
150+
// Each query must fit within the byte limit
151+
for (const query of sql) {
152+
expect(utf8ByteLength(query)).toBeLessThan(MAX_SQL_QUERY_SIZE)
153+
}
154+
155+
// First query should be INSERT, subsequent should be UPDATE
156+
expect(sql[0]).toContain('INSERT INTO')
157+
for (let i = 1; i < sql.length; i++) {
158+
expect(sql[i]).toContain('UPDATE')
159+
}
160+
161+
// Reconstruct the content from all queries and verify it matches the original
162+
const insertMatch = sql[0]!.match(/'(心+)'/)
163+
let reconstructed = insertMatch![1]!
164+
for (let i = 1; i < sql.length; i++) {
165+
const updateMatch = sql[i]!.match(/CONCAT\(content, '(心+)'\)/)
166+
reconstructed += updateMatch![1]!
167+
}
168+
expect(reconstructed).toBe(content)
169+
})
170+
171+
test('Succeed when SLICE_SIZE byte boundary falls on an emoji', () => {
172+
const collection = resolveCollection('content', defineCollection({
173+
type: 'data',
174+
source: '**',
175+
schema: z.object({
176+
content: z.string(),
177+
}),
178+
}))!
179+
180+
// 'a' (1 byte) shifts alignment so that the SLICE_SIZE byte boundary
181+
// falls in the middle of a '😀' (4 bytes in UTF-8)
182+
// biggestColumn = "'a😀😀...😀'" → byte 0: quote(1), byte 1: 'a'(1), bytes 2+: emojis(4 each)
183+
// Byte at SLICE_SIZE (70000) = 2 + 4*17499.5 → falls inside the 17500th emoji
184+
const content = 'a' + '😀'.repeat(25000)
185+
186+
const { queries: sql } = generateCollectionInsert(collection, {
187+
id: 'emoji-boundary.md',
188+
stem: 'emoji-boundary',
189+
extension: 'md',
190+
meta: {},
191+
content,
192+
})
193+
194+
// Must be split into multiple queries
195+
expect(sql.length).toBeGreaterThan(1)
196+
197+
// Each query must fit within the byte limit
198+
for (const query of sql) {
199+
expect(utf8ByteLength(query)).toBeLessThan(MAX_SQL_QUERY_SIZE)
200+
}
201+
202+
// First query should be INSERT, subsequent should be UPDATE
203+
expect(sql[0]).toContain('INSERT INTO')
204+
for (let i = 1; i < sql.length; i++) {
205+
expect(sql[i]).toContain('UPDATE')
206+
}
207+
208+
// Reconstruct the content from all queries and verify no emoji was split
209+
const insertMatch = sql[0]!.match(/'(a(?:😀)+)'/)
210+
expect(insertMatch).not.toBeNull()
211+
let reconstructed = insertMatch![1]!
212+
for (let i = 1; i < sql.length; i++) {
213+
const updateMatch = sql[i]!.match(/CONCAT\(content, '((?:😀)+)'\)/)
214+
expect(updateMatch).not.toBeNull()
215+
reconstructed += updateMatch![1]!
216+
}
217+
expect(reconstructed).toBe(content)
218+
})
126219
})

0 commit comments

Comments
 (0)