Skip to content

Commit 6dac3d1

Browse files
authored
✨ feat(utils): added trimBasedOnBatchProbe for truncating without compromising structured data (#11836)
1 parent ae3b6fd commit 6dac3d1

5 files changed

Lines changed: 453 additions & 0 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
// Barrel re-export: surfaces everything exported by ./trimBatchProbe/trimBatchProbe.
export * from './trimBatchProbe/trimBatchProbe'
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// Type-only re-exports from ./trimBatchProbe.
export type {
2+
Buildable,
3+
Input,
4+
Joiner,
5+
TrimBatchProbeOptions
6+
} from './trimBatchProbe'
7+
// Value re-export of the trimBasedOnBatchProbe function.
export { trimBasedOnBatchProbe } from './trimBatchProbe'
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
3+
import { resolveJoiner, normalizeToArray, truncateByPunctuation, hardTruncateFromTail, trimBasedOnBatchProbe } from './trimBatchProbe';
4+
5+
// Stub the 'tokenx' module so token counts are deterministic in these tests:
// one token per non-empty whitespace-separated word.
vi.mock('tokenx', () => ({
6+
estimateTokenCount: (str: string) => str.split(/\s+/).filter(Boolean).length,
7+
}));
8+
9+
describe('trimBasedOnBatchProbe', () => {
10+
// Under the mocked estimator the three detail strings total 10 tokens, while the
// three summaries total 6 — exactly the limit passed below — so the expected
// output is all three summaries joined with '\n'.
it('prefers compact builds to keep more segments', async () => {
11+
class BuildableChunk {
12+
constructor(private readonly detail: string, private readonly summary: string) {}
13+
// Returns the summary when a compact build is requested, otherwise the detail.
build(tryCompactIfPossible?: boolean) {
14+
return tryCompactIfPossible ? this.summary : this.detail;
15+
}
16+
}
17+
18+
const chunks = [
19+
new BuildableChunk('very old detail segment', 'old summary'),
20+
new BuildableChunk('middle detail segment', 'mid summary'),
21+
new BuildableChunk('latest detail segment', 'latest summary'),
22+
];
23+
24+
const result = await trimBasedOnBatchProbe(chunks, 6);
25+
26+
expect(result).toBe('old summary\nmid summary\nlatest summary');
27+
});
28+
29+
// Both strings together are 4 mock tokens, below the limit of 10, so the whole
// input is expected back, joined with '\n'.
it('keeps newest plain string chunks when under limit', async () => {
30+
const chunks = ['older message', 'newer message'];
31+
const result = await trimBasedOnBatchProbe(chunks, 10);
32+
33+
expect(result).toBe('older message\nnewer message');
34+
});
35+
36+
// Checks that the compact (summary) form is chosen when the detail form cannot
// fit every chunk within the limit.
it('prefers compact probe when it allows keeping more segments', async () => {
37+
class BuildableChunk {
38+
constructor(private readonly detail: string, private readonly summary: string) {}
39+
// Returns the summary when a compact build is requested, otherwise the detail.
build(tryCompactIfPossible?: boolean) {
40+
return tryCompactIfPossible ? this.summary : this.detail;
41+
}
42+
}
43+
44+
// Each detail is 3 tokens, each summary is 1 token (per the mocked estimator).
45+
const chunks = [
46+
new BuildableChunk('old detail chunk', 'old'),
47+
new BuildableChunk('mid detail chunk', 'mid'),
48+
new BuildableChunk('latest detail chunk', 'latest'),
49+
];
50+
51+
// With detail: 9 tokens total, and even the last 2 chunks (6 tokens) exceed the limit of 5.
// With summary: 3 tokens total, so all 3 chunks fit.
52+
const result = await trimBasedOnBatchProbe(chunks, 5);
53+
54+
expect(result).toBe('old\nmid\nlatest');
55+
});
56+
57+
// The three segments count as 2, 3 and 3 mock tokens; with a limit of 2 the
// newest segment alone (3 tokens) is already over the limit.
// NOTE(review): the assertion only checks that the result is non-empty; it does
// not verify that only the newest segment was truncated, as the title suggests.
it('truncates only the newest structured segment when all probes fail', async () => {
58+
const structured = [
59+
'<root><a>keep me</a></root>',
60+
'<root><b>keep me too</b></root>',
61+
'<root><c>truncate me last</c></root>',
62+
];
63+
64+
const result = await trimBasedOnBatchProbe(structured, 2, { tryChunkingByPunctuation: true });
65+
66+
// When the limit is extremely small, we still return a truncated newest segment rather than empty.
67+
expect(result.length).toBeGreaterThan(0);
68+
});
69+
70+
// The input is 4 mock tokens against a limit of 1, with both truncation
// strategies switched off; the expected result is the empty string.
// NOTE(review): here the options object (carrying tokenLimit) is passed as the
// second argument, whereas the other tests pass a numeric limit second and the
// options third — presumably a supported call shape; confirm against the signature.
it('returns empty when truncation is disabled and nothing fits', async () => {
71+
const result = await trimBasedOnBatchProbe('too long without truncation', {
72+
tokenLimit: 1,
73+
tryChunkingByPunctuation: false,
74+
tryHardTruncation: false,
75+
});
76+
77+
expect(result).toBe('');
78+
});
79+
80+
// A single buildable whose detail is 4 mock tokens and whose summary is 1 token;
// with a limit of 2 the summary is expected verbatim rather than a truncated detail.
it('uses compact build for single buildable before truncation', async () => {
81+
class BuildableChunk {
82+
constructor(private readonly detail: string, private readonly summary: string) {}
83+
// Returns the summary when a compact build is requested, otherwise the detail.
build(tryCompactIfPossible?: boolean) {
84+
return tryCompactIfPossible ? this.summary : this.detail;
85+
}
86+
}
87+
88+
const result = await trimBasedOnBatchProbe(new BuildableChunk('too long detail text', 'short'), 2);
89+
90+
expect(result).toBe('short');
91+
});
92+
93+
// Checks that the largest batch of newest segments fitting the limit is returned.
it('selects the largest newest batch within limit via probing', async () => {
94+
// 5 segments of two tokens each => 10 tokens total. With a limit of 5, the last
// 3 segments (6 tokens) are too many, so the last 2 segments (4 tokens) are kept.
95+
const segments = ['a b', 'c d', 'e f', 'g h', 'i j'];
96+
const result = await trimBasedOnBatchProbe(segments, 5);
97+
98+
expect(result).toBe('g h\ni j');
99+
});
100+
101+
// Two scenarios on a single string input with default options.
it('falls back to punctuation then hard truncation for single strings', async () => {
102+
// Scenario 1: 12 mock tokens (5 + 5 + 2) against a limit of 4; only the
// trailing two-word fragment is expected to survive.
const text =
103+
'Older sentence should be dropped. Newest sentence should stay intact. trailing tail';
104+
105+
const result = await trimBasedOnBatchProbe(text, 4);
106+
107+
expect(result).toBe('trailing tail');
108+
109+
// Scenario 2: a string with no whitespace or punctuation.
// NOTE(review): under the mocked estimator this string counts as 1 token, which
// is below the limit of 3, so this may not actually reach hard truncation — confirm.
const longToken = 'thisisaverylongtokenwithoutspacesorpunctuationthatkeepsgoing';
110+
const hardResult = await trimBasedOnBatchProbe(longToken, 3);
111+
112+
// Whatever is returned must be a non-empty suffix of the input.
expect(hardResult.length).toBeGreaterThan(0);
113+
expect(longToken.endsWith(hardResult)).toBe(true);
114+
});
115+
116+
describe('__private helpers', () => {
117+
// Covers the four joiner inputs exercised here: omitted, string, function, and
// an object exposing join().
it('resolves joiners correctly', async () => {
118+
const fnJoiner = (batch: string[]) => batch.join('|');
119+
const objJoiner = { join: (batch: string[]) => batch.join('*') };
120+
121+
// No argument resolves to '\n'; a string is returned unchanged.
expect(resolveJoiner()).toBe('\n');
122+
expect(resolveJoiner(',') as string).toBe(',');
123+
// Function and object joiners resolve to something callable with the batch.
// NOTE(review): the `as any` casts bypass the declared return type of resolveJoiner.
expect(await (resolveJoiner(fnJoiner) as any)(['a', 'b'])).toBe('a|b');
124+
expect(await (resolveJoiner(objJoiner) as any)(['a', 'b'])).toBe('a*b');
125+
});
126+
127+
// null becomes an empty array, a single value is wrapped, and an array passes
// through with equal contents.
it('normalizes inputs to array', () => {
128+
expect(normalizeToArray(null)).toEqual([]);
129+
expect(normalizeToArray('one')).toEqual(['one']);
130+
expect(normalizeToArray(['a', 'b'])).toEqual(['a', 'b']);
131+
});
132+
133+
// The text is 6 mock tokens; with a limit of 3 the fragment after the period
// (exactly 3 tokens) is the expected result.
it('truncates by punctuation when possible', async () => {
134+
const text = 'keep this sentence. drop these words';
135+
const result = await truncateByPunctuation(text, 3);
136+
137+
expect(result).toBe('drop these words');
138+
});
139+
140+
// Asserts only that the result is a non-empty suffix of the input.
// NOTE(review): under the mocked estimator the input is a single token, below
// the limit of 2, so the shrinking-window path may not be exercised — confirm.
it('hard truncates from tail with shrinking window', async () => {
141+
const text = 'averylongtokenwithoutspacesorpunctuation';
142+
const result = await hardTruncateFromTail(text, 2);
143+
144+
expect(result.length).toBeGreaterThan(0);
145+
expect(text.endsWith(result)).toBe(true);
146+
});
147+
});
148+
});

0 commit comments

Comments
 (0)