Skip to content

Commit 7c4f4c2

Browse files
authored
GH-39855: [Python] ListView support for pa.array() (#40160)
### Rationale for this change

Add pa.array() instantiation support for the ListView and LargeListView types.

### What changes are included in this PR?

* pa.array() supports creating ListView and LargeListView types.
* ListArray and LargeListArray now have their size initialized before elements are added during Python-to-Arrow conversion. This allows these types to be converted to the ListViewArray and LargeListViewArray types.

### Are these changes tested?

Yes, unit tested.

### Are there any user-facing changes?

Yes, a new feature is added.

* Closes: #39855
* GitHub Issue: #39855

Authored-by: Dane Pitkin <dane@voltrondata.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
1 parent 2308e40 commit 7c4f4c2

5 files changed

Lines changed: 218 additions & 43 deletions

File tree

python/pyarrow/src/arrow/python/python_to_arrow.cc

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,8 @@ struct PyConverterTrait<
581581
};
582582

583583
template <typename T>
584-
struct PyConverterTrait<T, enable_if_list_like<T>> {
584+
struct PyConverterTrait<
585+
T, enable_if_t<is_list_like_type<T>::value || is_list_view_type<T>::value>> {
585586
using type = PyListConverter<T>;
586587
};
587588

@@ -803,7 +804,6 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
803804
return this->list_builder_->AppendNull();
804805
}
805806

806-
RETURN_NOT_OK(this->list_builder_->Append());
807807
if (PyArray_Check(value)) {
808808
RETURN_NOT_OK(AppendNdarray(value));
809809
} else if (PySequence_Check(value)) {
@@ -824,6 +824,21 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
824824
}
825825

826826
protected:
827+
// MapType does not support args in the Append() method
828+
Status AppendTo(const MapType*, int64_t size) { return this->list_builder_->Append(); }
829+
830+
// FixedSizeListType does not support args in the Append() method
831+
Status AppendTo(const FixedSizeListType*, int64_t size) {
832+
return this->list_builder_->Append();
833+
}
834+
835+
// ListType requires the size argument in the Append() method
836+
// in order to be convertible to a ListViewType. ListViewType
837+
// requires the size argument in the Append() method always.
838+
Status AppendTo(const BaseListType*, int64_t size) {
839+
return this->list_builder_->Append(true, size);
840+
}
841+
827842
Status ValidateBuilder(const MapType*) {
828843
if (this->list_builder_->key_builder()->null_count() > 0) {
829844
return Status::Invalid("Invalid Map: key field cannot contain null values");
@@ -836,11 +851,14 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
836851

837852
Status AppendSequence(PyObject* value) {
838853
int64_t size = static_cast<int64_t>(PySequence_Size(value));
854+
RETURN_NOT_OK(AppendTo(this->list_type_, size));
839855
RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
840856
return this->value_converter_->Extend(value, size);
841857
}
842858

843859
Status AppendIterable(PyObject* value) {
860+
auto size = static_cast<int64_t>(PyObject_Size(value));
861+
RETURN_NOT_OK(AppendTo(this->list_type_, size));
844862
PyObject* iterator = PyObject_GetIter(value);
845863
OwnedRef iter_ref(iterator);
846864
while (PyObject* item = PyIter_Next(iterator)) {
@@ -857,6 +875,7 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
857875
return Status::Invalid("Can only convert 1-dimensional array values");
858876
}
859877
const int64_t size = PyArray_SIZE(ndarray);
878+
RETURN_NOT_OK(AppendTo(this->list_type_, size));
860879
RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
861880

862881
const auto value_type = this->value_converter_->builder()->type();

python/pyarrow/tests/strategies.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,9 @@ def list_types(item_strategy=primitive_types):
167167
pa.list_,
168168
item_strategy,
169169
st.integers(min_value=0, max_value=16)
170-
)
170+
),
171+
st.builds(pa.list_view, item_strategy),
172+
st.builds(pa.large_list_view, item_strategy)
171173
)
172174

173175

python/pyarrow/tests/test_array.py

Lines changed: 136 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,8 @@ def test_string_binary_from_buffers():
627627
assert copied.null_count == 0
628628

629629

630-
@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
630+
@pytest.mark.parametrize('list_type_factory', [
631+
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
631632
def test_list_from_buffers(list_type_factory):
632633
ty = list_type_factory(pa.int16())
633634
array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
@@ -637,15 +638,15 @@ def test_list_from_buffers(list_type_factory):
637638

638639
with pytest.raises(ValueError):
639640
# No children
640-
pa.Array.from_buffers(ty, 4, [None, buffers[1]])
641+
pa.Array.from_buffers(ty, 4, buffers[:ty.num_buffers])
641642

642-
child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
643-
copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child])
643+
child = pa.Array.from_buffers(pa.int16(), 6, buffers[ty.num_buffers:])
644+
copied = pa.Array.from_buffers(ty, 4, buffers[:ty.num_buffers], children=[child])
644645
assert copied.equals(array)
645646

646647
with pytest.raises(ValueError):
647648
# too many children
648-
pa.Array.from_buffers(ty, 4, [None, buffers[1]],
649+
pa.Array.from_buffers(ty, 4, buffers[:ty.num_buffers],
649650
children=[child, child])
650651

651652

@@ -2022,6 +2023,9 @@ def test_cast_identities(ty, values):
20222023
([[1, 2], [3]], pa.list_(pa.int64())),
20232024
([[4, 5], [6]], pa.large_list(pa.int16())),
20242025
([['a'], None, ['b', 'c']], pa.list_(pa.string())),
2026+
([[1, 2], [3]], pa.list_view(pa.int64())),
2027+
([[4, 5], [6]], pa.large_list_view(pa.int16())),
2028+
([['a'], None, ['b', 'c']], pa.list_view(pa.string())),
20252029
([(1, 'a'), (2, 'c'), None],
20262030
pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
20272031
]
@@ -3575,9 +3579,10 @@ def test_run_end_encoded_from_buffers():
35753579
1, offset, children)
35763580

35773581

3578-
@pytest.mark.parametrize(('list_array_type'),
3579-
[pa.ListViewArray, pa.LargeListViewArray])
3580-
def test_list_view_from_arrays(list_array_type):
3582+
@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
3583+
[(pa.ListViewArray, pa.list_view),
3584+
(pa.LargeListViewArray, pa.large_list_view)])
3585+
def test_list_view_from_arrays(list_array_type, list_type_factory):
35813586
# test in order offsets, similar to ListArray representation
35823587
values = [1, 2, 3, 4, 5, 6, None, 7]
35833588
offsets = [0, 2, 4, 6]
@@ -3589,6 +3594,17 @@ def test_list_view_from_arrays(list_array_type):
35893594
assert array.offsets.to_pylist() == offsets
35903595
assert array.sizes.to_pylist() == sizes
35913596

3597+
# with specified type
3598+
typ = list_type_factory(pa.field("name", pa.int64()))
3599+
result = list_array_type.from_arrays(offsets, sizes, values, typ)
3600+
assert result.type == typ
3601+
assert result.type.value_field.name == "name"
3602+
3603+
# with mismatching type
3604+
typ = list_type_factory(pa.binary())
3605+
with pytest.raises(TypeError):
3606+
list_array_type.from_arrays(offsets, sizes, values, type=typ)
3607+
35923608
# test out of order offsets with overlapping values
35933609
values = [1, 2, 3, 4]
35943610
offsets = [2, 1, 0]
@@ -3635,12 +3651,121 @@ def test_list_view_from_arrays(list_array_type):
36353651
assert array.sizes.to_pylist() == sizes
36363652

36373653

3638-
@pytest.mark.parametrize(('list_array_type'),
3639-
[pa.ListViewArray, pa.LargeListViewArray])
3640-
def test_list_view_flatten(list_array_type):
3654+
@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
3655+
[(pa.ListViewArray, pa.list_view),
3656+
(pa.LargeListViewArray, pa.large_list_view)])
3657+
def test_list_view_from_arrays_fails(list_array_type, list_type_factory):
3658+
values = [1, 2]
3659+
offsets = [0, 1, None]
3660+
sizes = [1, 1, 0]
3661+
mask = pa.array([False, False, True])
3662+
3663+
# Ambiguous to specify both validity map and offsets or sizes with nulls
3664+
with pytest.raises(pa.lib.ArrowInvalid):
3665+
list_array_type.from_arrays(offsets, sizes, values, mask=mask)
3666+
3667+
offsets = [0, 1, 1]
3668+
array = list_array_type.from_arrays(offsets, sizes, values, mask=mask)
3669+
array_slice = array[1:]
3670+
3671+
# List offsets and sizes must not be slices if a validity map is specified
3672+
with pytest.raises(pa.lib.ArrowInvalid):
3673+
list_array_type.from_arrays(
3674+
array_slice.offsets, array_slice.sizes,
3675+
array_slice.values, mask=array_slice.is_null())
3676+
3677+
3678+
@pytest.mark.parametrize(('list_array_type', 'list_type_factory', 'offset_type'),
3679+
[(pa.ListViewArray, pa.list_view, pa.int32()),
3680+
(pa.LargeListViewArray, pa.large_list_view, pa.int64())])
3681+
def test_list_view_flatten(list_array_type, list_type_factory, offset_type):
3682+
arr0 = pa.array([
3683+
1, None, 2,
3684+
3, 4,
3685+
5, 6,
3686+
7, 8
3687+
], type=pa.int64())
3688+
3689+
typ1 = list_type_factory(pa.int64())
3690+
arr1 = pa.array([
3691+
[1, None, 2],
3692+
None,
3693+
[3, 4],
3694+
[],
3695+
[5, 6],
3696+
None,
3697+
[7, 8]
3698+
], type=typ1)
3699+
offsets1 = pa.array([0, 3, 3, 5, 5, 7, 7], type=offset_type)
3700+
sizes1 = pa.array([3, 0, 2, 0, 2, 0, 2], type=offset_type)
3701+
3702+
typ2 = list_type_factory(
3703+
list_type_factory(
3704+
pa.int64()
3705+
)
3706+
)
3707+
arr2 = pa.array([
3708+
None,
3709+
[
3710+
[1, None, 2],
3711+
None,
3712+
[3, 4]
3713+
],
3714+
[],
3715+
[
3716+
[],
3717+
[5, 6],
3718+
None
3719+
],
3720+
[
3721+
[7, 8]
3722+
]
3723+
], type=typ2)
3724+
offsets2 = pa.array([0, 0, 3, 3, 6], type=offset_type)
3725+
sizes2 = pa.array([0, 3, 0, 3, 1], type=offset_type)
3726+
3727+
assert arr1.flatten().equals(arr0)
3728+
assert arr1.offsets.equals(offsets1)
3729+
assert arr1.sizes.equals(sizes1)
3730+
assert arr1.values.equals(arr0)
3731+
assert arr2.flatten().equals(arr1)
3732+
assert arr2.offsets.equals(offsets2)
3733+
assert arr2.sizes.equals(sizes2)
3734+
assert arr2.values.equals(arr1)
3735+
assert arr2.flatten().flatten().equals(arr0)
3736+
assert arr2.values.values.equals(arr0)
3737+
3738+
# test out of order offsets
36413739
values = [1, 2, 3, 4]
36423740
offsets = [3, 2, 1, 0]
36433741
sizes = [1, 1, 1, 1]
36443742
array = list_array_type.from_arrays(offsets, sizes, values)
36453743

36463744
assert array.flatten().to_pylist() == [4, 3, 2, 1]
3745+
3746+
# test null elements backed by non-empty sublists
3747+
mask = pa.array([False, False, False, True])
3748+
array = list_array_type.from_arrays(offsets, sizes, values, mask=mask)
3749+
3750+
assert array.flatten().to_pylist() == [4, 3, 2]
3751+
assert array.values.to_pylist() == [1, 2, 3, 4]
3752+
3753+
3754+
@pytest.mark.parametrize('list_view_type', [pa.ListViewArray, pa.LargeListViewArray])
3755+
def test_list_view_slice(list_view_type):
3756+
# sliced -> values keeps referring to full values buffer, but offsets is
3757+
# sliced as well so the offsets correctly point into the full values array
3758+
# sliced -> flatten() will return the sliced value array.
3759+
3760+
array = list_view_type.from_arrays(offsets=[0, 3, 4], sizes=[
3761+
3, 1, 2], values=[1, 2, 3, 4, 5, 6])
3762+
sliced_array = array[1:]
3763+
3764+
assert sliced_array.values.to_pylist() == [1, 2, 3, 4, 5, 6]
3765+
assert sliced_array.offsets.to_pylist() == [3, 4]
3766+
assert sliced_array.flatten().to_pylist() == [4, 5, 6]
3767+
3768+
i = sliced_array.offsets[0].as_py()
3769+
j = sliced_array.offsets[1].as_py()
3770+
3771+
assert sliced_array[0].as_py() == sliced_array.values[i:j].to_pylist() == [4]

python/pyarrow/tests/test_convert_builtin.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -252,21 +252,17 @@ def test_nested_lists(seq):
252252
assert arr.null_count == 1
253253
assert arr.type == pa.list_(pa.int64())
254254
assert arr.to_pylist() == data
255-
# With explicit type
256-
arr = pa.array(seq(data), type=pa.list_(pa.int32()))
257-
assert len(arr) == 3
258-
assert arr.null_count == 1
259-
assert arr.type == pa.list_(pa.int32())
260-
assert arr.to_pylist() == data
261255

262256

263257
@parametrize_with_sequence_types
264-
def test_nested_large_lists(seq):
258+
@pytest.mark.parametrize("factory", [
259+
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
260+
def test_nested_lists_with_explicit_type(seq, factory):
265261
data = [[], [1, 2], None]
266-
arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
262+
arr = pa.array(seq(data), type=factory(pa.int16()))
267263
assert len(arr) == 3
268264
assert arr.null_count == 1
269-
assert arr.type == pa.large_list(pa.int16())
265+
assert arr.type == factory(pa.int16())
270266
assert arr.to_pylist() == data
271267

272268

@@ -277,15 +273,22 @@ def test_list_with_non_list(seq):
277273
pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
278274
with pytest.raises(TypeError):
279275
pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
276+
with pytest.raises(TypeError):
277+
pa.array(seq([[], [1, 2], 3]), type=pa.list_view(pa.int64()))
278+
with pytest.raises(TypeError):
279+
pa.array(seq([[], [1, 2], 3]), type=pa.large_list_view(pa.int64()))
280280

281281

282282
@parametrize_with_sequence_types
283-
def test_nested_arrays(seq):
283+
@pytest.mark.parametrize("factory", [
284+
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
285+
def test_nested_arrays(seq, factory):
284286
arr = pa.array(seq([np.array([], dtype=np.int64),
285-
np.array([1, 2], dtype=np.int64), None]))
287+
np.array([1, 2], dtype=np.int64), None]),
288+
type=factory(pa.int64()))
286289
assert len(arr) == 3
287290
assert arr.null_count == 1
288-
assert arr.type == pa.list_(pa.int64())
291+
assert arr.type == factory(pa.int64())
289292
assert arr.to_pylist() == [[], [1, 2], None]
290293

291294

@@ -1464,9 +1467,18 @@ def test_sequence_duration_nested_lists():
14641467
assert arr.type == pa.list_(pa.duration('us'))
14651468
assert arr.to_pylist() == data
14661469

1467-
arr = pa.array(data, type=pa.list_(pa.duration('ms')))
1470+
1471+
@pytest.mark.parametrize("factory", [
1472+
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
1473+
def test_sequence_duration_nested_lists_with_explicit_type(factory):
1474+
td1 = datetime.timedelta(1, 1, 1000)
1475+
td2 = datetime.timedelta(1, 100)
1476+
1477+
data = [[td1, None], [td1, td2]]
1478+
1479+
arr = pa.array(data, type=factory(pa.duration('ms')))
14681480
assert len(arr) == 2
1469-
assert arr.type == pa.list_(pa.duration('ms'))
1481+
assert arr.type == factory(pa.duration('ms'))
14701482
assert arr.to_pylist() == data
14711483

14721484

@@ -2430,6 +2442,10 @@ def test_array_from_pylist_offset_overflow():
24302442
),
24312443
([[1, 2, 3]], [pa.scalar([1, 2, 3])], pa.list_(pa.int64())),
24322444
([["a", "b"]], [pa.scalar(["a", "b"])], pa.list_(pa.string())),
2445+
([[1, 2, 3]], [pa.scalar([1, 2, 3], type=pa.list_view(pa.int64()))],
2446+
pa.list_view(pa.int64())),
2447+
([["a", "b"]], [pa.scalar(["a", "b"], type=pa.list_view(pa.string()))],
2448+
pa.list_view(pa.string())),
24332449
(
24342450
[1, 2, None],
24352451
[pa.scalar(1, type=pa.int8()), pa.scalar(2, type=pa.int8()), None],

0 commit comments

Comments
 (0)