Skip to content

Commit 9ba789d

Browse files
GH-43267: [C#] Correctly import sliced arrays through the C Data interface (#44117)
### What changes are included in this PR?

Changes to the C Data importer to correctly handle nonzero offsets.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

Closes #43267

* GitHub Issue: #43267

Authored-by: Curt Hagenlocher <curt@hagenlocher.org>
Signed-off-by: Curt Hagenlocher <curt@hagenlocher.org>
1 parent a5d40d0 commit 9ba789d

7 files changed

Lines changed: 88 additions & 18 deletions

File tree

csharp/src/Apache.Arrow/Apache.Arrow.csproj

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,16 @@
77
<Description>Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware.</Description>
88
</PropertyGroup>
99

10-
<PropertyGroup Condition="'$(IsWindows)'=='true'">
10+
<PropertyGroup>
1111
<TargetFrameworks>netstandard2.0;net6.0;net8.0;net462</TargetFrameworks>
1212
</PropertyGroup>
13-
<PropertyGroup Condition="'$(IsWindows)'!='true'">
14-
<TargetFrameworks>netstandard2.0;net6.0;net8.0</TargetFrameworks>
15-
</PropertyGroup>
1613

1714
<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETStandard' or '$(TargetFramework)' == 'net462'">
1815
<PackageReference Include="System.Buffers" Version="4.5.1" />
1916
<PackageReference Include="System.Memory" Version="4.5.5" />
2017
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.7.1" />
2118
<PackageReference Include="System.Threading.Tasks.Extensions" Version="4.5.4" />
19+
<PackageReference Include="System.ValueTuple" Version="4.5.0" />
2220
</ItemGroup>
2321

2422
<ItemGroup>

csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ private ArrayData[] ProcessStructChildren(CArrowArray* cArray, IReadOnlyList<Fie
260260

261261
private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray)
262262
{
263-
int length = checked((int)cArray->length);
263+
int length = checked((int)cArray->offset + (int)cArray->length);
264264
int validityLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);
265265
return (cArray->buffers[0] == null) ? ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength));
266266
}
@@ -285,7 +285,7 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray)
285285
throw new InvalidOperationException("Byte arrays are expected to have exactly three buffers");
286286
}
287287

288-
int length = checked((int)cArray->length);
288+
int length = checked((int)cArray->offset + (int)cArray->length);
289289
int offsetsLength = (length + 1) * 4;
290290
int* offsets = (int*)cArray->buffers[1];
291291
Debug.Assert(offsets != null);
@@ -306,7 +306,7 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray)
306306
throw new InvalidOperationException("Byte array views are expected to have at least three buffers");
307307
}
308308

309-
int length = checked((int)cArray->length);
309+
int length = checked((int)cArray->offset + (int)cArray->length);
310310
int viewsLength = length * 16;
311311

312312
long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1];
@@ -336,7 +336,7 @@ private ArrowBuffer[] ImportLargeByteArrayBuffers(CArrowArray* cArray)
336336
$"is greater than the maximum supported large byte array length ({maxLength})");
337337
}
338338

339-
int length = (int)cArray->length;
339+
int length = checked((int)cArray->offset + (int)cArray->length);
340340
int offsetsLength = (length + 1) * 8;
341341
long* offsets = (long*)cArray->buffers[1];
342342
Debug.Assert(offsets != null);
@@ -364,7 +364,7 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray)
364364
throw new InvalidOperationException("List arrays are expected to have exactly two buffers");
365365
}
366366

367-
int length = checked((int)cArray->length);
367+
int length = checked((int)cArray->offset + (int)cArray->length);
368368
int offsetsLength = (length + 1) * 4;
369369

370370
ArrowBuffer[] buffers = new ArrowBuffer[2];
@@ -381,7 +381,7 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray)
381381
throw new InvalidOperationException("List view arrays are expected to have exactly three buffers");
382382
}
383383

384-
int length = checked((int)cArray->length);
384+
int length = checked((int)cArray->offset + (int)cArray->length);
385385
int offsetsLength = length * 4;
386386

387387
ArrowBuffer[] buffers = new ArrowBuffer[3];
@@ -407,7 +407,7 @@ private ArrowBuffer[] ImportLargeListBuffers(CArrowArray* cArray)
407407
$"is greater than the maximum supported large list array length ({maxLength})");
408408
}
409409

410-
int length = (int)cArray->length;
410+
int length = checked((int)cArray->offset + (int)cArray->length);
411411
int offsetsLength = (length + 1) * 8;
412412

413413
ArrowBuffer[] buffers = new ArrowBuffer[2];
@@ -436,7 +436,7 @@ private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray)
436436
{
437437
throw new InvalidOperationException("Dense union arrays are expected to have exactly two children");
438438
}
439-
int length = checked((int)cArray->length);
439+
int length = checked((int)cArray->offset + (int)cArray->length);
440440
int offsetsLength = length * 4;
441441

442442
ArrowBuffer[] buffers = new ArrowBuffer[2];
@@ -454,7 +454,7 @@ private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray)
454454
}
455455

456456
ArrowBuffer[] buffers = new ArrowBuffer[1];
457-
buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length));
457+
buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->offset + (int)cArray->length));
458458

459459
return buffers;
460460
}
@@ -467,10 +467,10 @@ private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int bitWidth)
467467
}
468468

469469
// validity, data
470-
int length = checked((int)cArray->length);
470+
int length = checked((int)cArray->offset + (int)cArray->length);
471471
int valuesLength;
472472
if (bitWidth >= 8)
473-
valuesLength = checked((int)(cArray->length * bitWidth / 8));
473+
valuesLength = checked(length * bitWidth / 8);
474474
else
475475
valuesLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);
476476

csharp/src/Apache.Arrow/RecordBatch.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,17 @@ public RecordBatch Clone(MemoryAllocator allocator = default)
100100
return new RecordBatch(Schema, arrays, Length);
101101
}
102102

103+
public RecordBatch Slice(int offset, int length)
104+
{
105+
if (offset > Length)
106+
{
107+
throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for RecordBatch.Slice");
108+
}
109+
110+
length = Math.Min(Length - offset, length);
111+
return new RecordBatch(Schema, _arrays.Select(a => ArrowArrayFactory.Slice(a, offset, length)), length);
112+
}
113+
103114
public void Accept(IArrowArrayVisitor visitor)
104115
{
105116
switch (visitor)

csharp/src/Apache.Arrow/Utility.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,8 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16-
using Apache.Arrow.Flatbuf;
1716
using System;
1817
using System.Collections.Generic;
19-
using System.Text;
2018

2119
namespace Apache.Arrow
2220
{

csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,9 @@ private void CompareArrays(FixedSizeListArray actualArray)
566566
var listSize = ((FixedSizeListType)expectedArray.Data.DataType).ListSize;
567567
var expectedValuesSlice = ArrowArrayFactory.Slice(
568568
expectedArray.Values, expectedArray.Offset * listSize, expectedArray.Length * listSize);
569-
actualArray.Values.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare));
569+
var actualValuesSlice = ArrowArrayFactory.Slice(
570+
actualArray.Values, actualArray.Offset * listSize, actualArray.Length * listSize);
571+
actualValuesSlice.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare));
570572
}
571573

572574
private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer actualValidityBuffer, int actualBufferOffset)

csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,5 +92,23 @@ public unsafe void CallsReleaseForInvalid()
9292
GC.KeepAlive(releaseCallback);
9393
}
9494
#endif
95+
96+
[Fact]
97+
public unsafe void RoundTripInt32ArrayWithOffset()
98+
{
99+
Int32Array array = new Int32Array.Builder()
100+
.AppendRange(new[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 })
101+
.Build();
102+
IArrowArray sliced = array.Slice(2, 6);
103+
CArrowArray* cArray = CArrowArray.Create();
104+
CArrowArrayExporter.ExportArray(sliced, cArray);
105+
using (var importedSlice = (Int32Array)CArrowArrayImporter.ImportArray(cArray, array.Data.DataType))
106+
{
107+
Assert.Equal(6, importedSlice.Length);
108+
Assert.Equal(2, importedSlice.Offset);
109+
Assert.Equal(2, importedSlice.GetValue(0));
110+
}
111+
CArrowArray.Free(cArray);
112+
}
95113
}
96114
}

csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,49 @@ public unsafe void RoundTripTestBatch()
792792
CArrowSchema.Free(cImportSchema);
793793
}
794794

795+
[SkippableFact]
796+
public unsafe void RoundTripTestSlicedBatch()
797+
{
798+
// TODO: Enable these once the version of pyarrow referenced during testing supports them
799+
HashSet<ArrowTypeId> unsupported = new HashSet<ArrowTypeId> { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView };
800+
RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported);
801+
RecordBatch batch1slice = batch1.Slice(1, 2);
802+
RecordBatch batch2 = batch1slice.Clone();
803+
804+
CArrowArray* cExportArray = CArrowArray.Create();
805+
CArrowArrayExporter.ExportRecordBatch(batch1slice, cExportArray);
806+
807+
CArrowSchema* cExportSchema = CArrowSchema.Create();
808+
CArrowSchemaExporter.ExportSchema(batch1.Schema, cExportSchema);
809+
810+
CArrowArray* cImportArray = CArrowArray.Create();
811+
CArrowSchema* cImportSchema = CArrowSchema.Create();
812+
813+
// For Python, we need to provide the pointers
814+
long exportArrayPtr = ((IntPtr)cExportArray).ToInt64();
815+
long exportSchemaPtr = ((IntPtr)cExportSchema).ToInt64();
816+
long importArrayPtr = ((IntPtr)cImportArray).ToInt64();
817+
long importSchemaPtr = ((IntPtr)cImportSchema).ToInt64();
818+
819+
using (Py.GIL())
820+
{
821+
dynamic pa = Py.Import("pyarrow");
822+
dynamic exportedPyArray = pa.RecordBatch._import_from_c(exportArrayPtr, exportSchemaPtr);
823+
exportedPyArray._export_to_c(importArrayPtr, importSchemaPtr);
824+
}
825+
826+
Schema schema = CArrowSchemaImporter.ImportSchema(cImportSchema);
827+
RecordBatch importedBatch = CArrowArrayImporter.ImportRecordBatch(cImportArray, schema);
828+
829+
ArrowReaderVerifier.CompareBatches(batch2, importedBatch, strictCompare: false); // Non-strict because span lengths won't match.
830+
831+
// Since we allocated, we are responsible for freeing the pointer.
832+
CArrowArray.Free(cExportArray);
833+
CArrowSchema.Free(cExportSchema);
834+
CArrowArray.Free(cImportArray);
835+
CArrowSchema.Free(cImportSchema);
836+
}
837+
795838
[SkippableFact]
796839
public unsafe void ExportBatchReader()
797840
{

0 commit comments

Comments
 (0)