Skip to content

Commit 56186b9

Browse files
authored
GH-41225: [C#] Slice value buffers when writing sliced list or binary arrays in IPC format (#41230)
### Rationale for this change

This reduces file sizes when writing sliced binary or list arrays to the IPC format.

### What changes are included in this PR?

Changes `ArrowStreamWriter` to write only the subset of the values that is needed, rather than the full value buffer, when writing a `ListArray` or `BinaryArray`, and to compute shifted value offset buffers.

### Are these changes tested?

This code is covered by existing tests, and the change doesn't introduce any difference in the observed array values, so I haven't added new tests or checks. I did, however, change how list arrays are compared, because we can no longer compare the value and value offset buffers directly: the tests now get list items as arrays and create a new `ArrayComparer` to compare them. This meant that array offsets are no longer always zero, so I've changed the offset assertions to only be used in strict mode.

### Are there any user-facing changes?

Yes, this might reduce IPC file sizes for users writing sliced data.

* GitHub Issue: #41225

Authored-by: Adam Reeve <adreeve@gmail.com>
Signed-off-by: Curt Hagenlocher <curt@hagenlocher.org>
1 parent 3c37848 commit 56186b9

4 files changed

Lines changed: 122 additions & 30 deletions

File tree

csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,18 @@ public void Visit(BooleanArray array)
163163
public void Visit(ListArray array)
164164
{
165165
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length));
166-
_buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length + 1));
166+
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length)));
167167

168-
VisitArray(array.Values);
168+
int valuesOffset = array.ValueOffsets[0];
169+
int valuesLength = array.ValueOffsets[array.Length] - valuesOffset;
170+
171+
var values = array.Values;
172+
if (valuesOffset > 0 || valuesLength < values.Length)
173+
{
174+
values = ArrowArrayFactory.Slice(values, valuesOffset, valuesLength);
175+
}
176+
177+
VisitArray(values);
169178
}
170179

171180
public void Visit(ListViewArray array)
@@ -195,8 +204,12 @@ public void Visit(FixedSizeListArray array)
195204
public void Visit(BinaryArray array)
196205
{
197206
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length));
198-
_buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length + 1));
199-
_buffers.Add(CreateBuffer(array.ValueBuffer));
207+
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length)));
208+
209+
int valuesOffset = array.ValueOffsets[0];
210+
int valuesLength = array.ValueOffsets[array.Length] - valuesOffset;
211+
212+
_buffers.Add(CreateSlicedBuffer<byte>(array.ValueBuffer, valuesOffset, valuesLength));
200213
}
201214

202215
public void Visit(BinaryViewArray array)
@@ -263,6 +276,39 @@ public void Visit(NullArray array)
263276
// There are no buffers for a NullArray
264277
}
265278

279+
/// <summary>
/// Returns a value-offsets buffer covering <paramref name="arrayLength"/> + 1 entries
/// for the window that starts at <paramref name="arrayOffset"/>.
/// When <paramref name="arrayOffset"/> is non-zero the entries are copied into a newly
/// allocated buffer with the window's first offset subtracted from each one; otherwise
/// the existing buffer is returned, truncated if it is longer than the required size.
/// </summary>
/// <param name="valueOffsetsBuffer">Offsets buffer of the array being written, read as 32-bit integers.</param>
/// <param name="arrayOffset">Index of the first offset entry belonging to the array.</param>
/// <param name="arrayLength">Number of elements in the array; arrayLength + 1 offsets are produced.</param>
/// <returns>An <c>ArrowBuffer</c> holding the offsets to write.</returns>
private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength)
280+
{
281+
// Byte size needed for arrayLength + 1 int offsets, as rounded by CalculatePaddedBufferLength.
var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * (arrayLength + 1));
282+
283+
if (arrayOffset != 0)
284+
{
285+
// Array has been sliced, so we need to shift and adjust the offsets
286+
var originalOffsets = valueOffsetsBuffer.Span.CastTo<int>().Slice(arrayOffset, arrayLength + 1);
287+
// NOTE(review): when arrayLength == 0, firstOffset is 0, so the single offset written by the
// loop below is originalOffsets[0] unchanged (not rebased to zero) - confirm this is intended.
var firstOffset = arrayLength > 0 ? originalOffsets[0] : 0;
288+
289+
// Allocated at the padded size; only the first arrayLength + 1 entries are assigned below.
var newValueOffsetsBuffer = _allocator.Allocate(requiredBytes);
290+
var newValueOffsets = newValueOffsetsBuffer.Memory.Span.CastTo<int>();
291+
292+
// Rebase every offset in the window relative to the window's first offset.
for (int i = 0; i < arrayLength + 1; ++i)
293+
{
294+
newValueOffsets[i] = originalOffsets[i] - firstOffset;
295+
}
296+
297+
return new ArrowBuffer(newValueOffsetsBuffer);
298+
}
299+
// NOTE(review): the two branches below return the offsets unshifted; this presumably relies on
// the first offset being 0 whenever arrayOffset is 0 - verify against the callers, which slice
// the values starting at ValueOffsets[0].
else if (valueOffsetsBuffer.Length > requiredBytes)
300+
{
301+
// Array may have been sliced but the offset is zero,
302+
// so we can truncate the existing offsets
303+
return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0, requiredBytes));
304+
}
305+
else
306+
{
307+
// Use the full buffer
308+
return valueOffsetsBuffer;
309+
}
310+
}
311+
266312
private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length)
267313
{
268314
if (buffer.IsEmpty)

csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ public async Task WritesFooterAlignedMultipleOf8Async()
113113
[InlineData(0, 45)]
114114
[InlineData(3, 45)]
115115
[InlineData(16, 45)]
116+
[InlineData(10, 0)]
116117
public async Task WriteSlicedArrays(int sliceOffset, int sliceLength)
117118
{
118119
var originalBatch = TestData.CreateSampleRecordBatch(length: 100);

csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,14 @@ public void Visit(StructArray array)
160160

161161
Assert.Equal(expectedArray.Length, array.Length);
162162
Assert.Equal(expectedArray.NullCount, array.NullCount);
163-
Assert.Equal(0, array.Offset);
164163
Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length);
165164
Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);
166165

166+
if (_strictCompare)
167+
{
168+
Assert.Equal(expectedArray.Offset, array.Offset);
169+
}
170+
167171
for (int i = 0; i < array.Fields.Count; i++)
168172
{
169173
array.Fields[i].Accept(new ArrayComparer(expectedArray.Fields[i], _strictCompare));
@@ -178,12 +182,12 @@ public void Visit(UnionArray array)
178182
Assert.Equal(expectedArray.Mode, array.Mode);
179183
Assert.Equal(expectedArray.Length, array.Length);
180184
Assert.Equal(expectedArray.NullCount, array.NullCount);
181-
Assert.Equal(0, array.Offset);
182185
Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length);
183186
Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);
184187

185188
if (_strictCompare)
186189
{
190+
Assert.Equal(expectedArray.Offset, array.Offset);
187191
Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span));
188192
}
189193
else
@@ -252,12 +256,12 @@ private void CompareBinaryArrays<T>(BinaryArray actualArray)
252256

253257
Assert.Equal(expectedArray.Length, actualArray.Length);
254258
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
255-
Assert.Equal(0, actualArray.Offset);
256259

257260
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
258261

259262
if (_strictCompare)
260263
{
264+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
261265
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
262266
Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length)));
263267
}
@@ -284,7 +288,11 @@ private void CompareVariadicArrays<T>(BinaryViewArray actualArray)
284288

285289
Assert.Equal(expectedArray.Length, actualArray.Length);
286290
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
287-
Assert.Equal(0, actualArray.Offset);
291+
292+
if (_strictCompare)
293+
{
294+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
295+
}
288296

289297
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
290298

@@ -309,12 +317,12 @@ private void CompareArrays(FixedSizeBinaryArray actualArray)
309317

310318
Assert.Equal(expectedArray.Length, actualArray.Length);
311319
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
312-
Assert.Equal(0, actualArray.Offset);
313320

314321
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
315322

316323
if (_strictCompare)
317324
{
325+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
318326
Assert.True(expectedArray.ValueBuffer.Span.Slice(0, expectedArray.Length).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0, actualArray.Length)));
319327
}
320328
else
@@ -338,12 +346,12 @@ private void CompareArrays<T>(PrimitiveArray<T> actualArray)
338346

339347
Assert.Equal(expectedArray.Length, actualArray.Length);
340348
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
341-
Assert.Equal(0, actualArray.Offset);
342349

343350
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
344351

345352
if (_strictCompare)
346353
{
354+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
347355
Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length)));
348356
}
349357
else
@@ -370,12 +378,12 @@ private void CompareArrays(BooleanArray actualArray)
370378

371379
Assert.Equal(expectedArray.Length, actualArray.Length);
372380
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
373-
Assert.Equal(0, actualArray.Offset);
374381

375382
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
376383

377384
if (_strictCompare)
378385
{
386+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
379387
int booleanByteCount = BitUtility.ByteCount(expectedArray.Length);
380388
Assert.True(expectedArray.Values.Slice(0, booleanByteCount).SequenceEqual(actualArray.Values.Slice(0, booleanByteCount)));
381389
}
@@ -397,22 +405,31 @@ private void CompareArrays(ListArray actualArray)
397405

398406
Assert.Equal(expectedArray.Length, actualArray.Length);
399407
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
400-
Assert.Equal(0, actualArray.Offset);
401408

402409
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
403410

404411
if (_strictCompare)
405412
{
413+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
406414
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
415+
actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare));
407416
}
408417
else
409418
{
410-
int offsetsStart = (expectedArray.Offset) * sizeof(int);
411-
int offsetsLength = (expectedArray.Length + 1) * sizeof(int);
412-
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart, offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength)));
419+
for (int i = 0; i < actualArray.Length; ++i)
420+
{
421+
if (expectedArray.IsNull(i))
422+
{
423+
Assert.True(actualArray.IsNull(i));
424+
}
425+
else
426+
{
427+
var expectedList = expectedArray.GetSlicedValues(i);
428+
var actualList = actualArray.GetSlicedValues(i);
429+
actualList.Accept(new ArrayComparer(expectedList, _strictCompare));
430+
}
431+
}
413432
}
414-
415-
actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare));
416433
}
417434

418435
private void CompareArrays(ListViewArray actualArray)
@@ -424,12 +441,12 @@ private void CompareArrays(ListViewArray actualArray)
424441

425442
Assert.Equal(expectedArray.Length, actualArray.Length);
426443
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
427-
Assert.Equal(0, actualArray.Offset);
428444

429445
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
430446

431447
if (_strictCompare)
432448
{
449+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
433450
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
434451
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
435452
}
@@ -453,7 +470,10 @@ private void CompareArrays(FixedSizeListArray actualArray)
453470

454471
Assert.Equal(expectedArray.Length, actualArray.Length);
455472
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
456-
Assert.Equal(0, actualArray.Offset);
473+
if (_strictCompare)
474+
{
475+
Assert.Equal(expectedArray.Offset, actualArray.Offset);
476+
}
457477

458478
CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);
459479

csharp/test/Apache.Arrow.Tests/TestData.cs

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,18 @@ public void Visit(StringType type)
294294

295295
for (var i = 0; i < Length; i++)
296296
{
297-
builder.Append(str);
297+
switch (i % 3)
298+
{
299+
case 0:
300+
builder.AppendNull();
301+
break;
302+
case 1:
303+
builder.Append(str);
304+
break;
305+
case 2:
306+
builder.Append(str + str);
307+
break;
308+
}
298309
}
299310

300311
Array = builder.Build();
@@ -328,15 +339,21 @@ public void Visit(ListType type)
328339
{
329340
var builder = new ListArray.Builder(type.ValueField).Reserve(Length);
330341

331-
var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1);
342+
var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length * 3 / 2);
332343

333344
for (var i = 0; i < Length; i++)
334345
{
335-
builder.Append();
336-
valueBuilder.Append(i);
346+
if (i % 10 == 2)
347+
{
348+
builder.AppendNull();
349+
}
350+
else
351+
{
352+
builder.Append();
353+
var listLength = i % 4;
354+
valueBuilder.AppendRange(Enumerable.Range(i, listLength).Select(x => (long)x));
355+
}
337356
}
338-
//Add a value to check if Values.Length can exceed ListArray.Length
339-
valueBuilder.Append(0);
340357

341358
Array = builder.Build();
342359
}
@@ -352,8 +369,12 @@ public void Visit(ListViewType type)
352369
builder.Append();
353370
valueBuilder.Append(i);
354371
}
355-
//Add a value to check if Values.Length can exceed ListArray.Length
356-
valueBuilder.Append(0);
372+
373+
if (Length > 0)
374+
{
375+
// Add a value to check if Values.Length can exceed ListArray.Length
376+
valueBuilder.Append(0);
377+
}
357378

358379
Array = builder.Build();
359380
}
@@ -562,9 +583,13 @@ public void Visit(MapType type)
562583
keyBuilder.Append(i.ToString());
563584
valueBuilder.Append(i);
564585
}
565-
//Add a value to check if Values.Length can exceed MapArray.Length
566-
keyBuilder.Append("0");
567-
valueBuilder.Append(0);
586+
587+
if (Length > 0)
588+
{
589+
// Add a value to check if Values.Length can exceed MapArray.Length
590+
keyBuilder.Append("0");
591+
valueBuilder.Append(0);
592+
}
568593

569594
Array = builder.Build();
570595
}

0 commit comments

Comments
 (0)