-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Labels
Description
Describe the bug
/// Reproduction: reads `nested_structs.rust.parquet` with a projection of
/// column index `0` and checks that the schema reported by the record reader
/// agrees with the schema of the batch it actually yields.
#[test]
fn test_read_structs() {
    // Build the fixture path inside the shared parquet test-data directory.
    let data_dir = arrow::util::test_util::parquet_test_data();
    let fixture = format!("{}/nested_structs.rust.parquet", data_dir);

    // Open the file and wrap it in an arrow-aware parquet reader.
    let file = File::open(&fixture).unwrap();
    let file_reader = SerializedFileReader::try_from(file).unwrap();
    let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader));

    // Project only index 0 and pull the first batch out of the reader.
    let mut record_reader = arrow_reader
        .get_record_reader_by_columns(vec![0], 60)
        .unwrap();
    let first_batch = record_reader.next().unwrap().unwrap();

    // The reader's advertised schema must equal the schema of the batch.
    assert_eq!(record_reader.schema, first_batch.schema());

    // The first column's data type must match the type declared for field 0.
    let batch_schema = first_batch.schema();
    let declared_type = batch_schema.field(0).data_type();
    let actual_type = first_batch.column(0).data_type();
    assert_eq!(actual_type, declared_type);
}
This test fails because the schema is computed based on filtering the root nodes.
i.e. it thinks this is the projected schema:
required group field_id=-1 schema {
required group field_id=-1 roll_num {
required int64 field_id=-1 min (Int(bitWidth=64, isSigned=true));
required int64 field_id=-1 max (Int(bitWidth=64, isSigned=true));
required int64 field_id=-1 mean (Int(bitWidth=64, isSigned=true));
required int64 field_id=-1 count (Int(bitWidth=64, isSigned=false));
required int64 field_id=-1 sum (Int(bitWidth=64, isSigned=true));
required int64 field_id=-1 variance (Int(bitWidth=64, isSigned=true));
}
}
When it is actually
required group field_id=-1 schema {
required group field_id=-1 roll_num {
required int64 field_id=-1 min (Int(bitWidth=64, isSigned=true));
}
}
This is because the column indices provided to get_record_reader_by_columns are the parquet column indices, not the arrow field indices.
To Reproduce
Run the test above; it fails.
Expected behavior
The computed schema should be correct, i.e. the schema reported by the record reader should match the schema of the batches it returns.
Additional context
Related to apache/datafusion#2439 and #1651