Skip to content

Commit 458f063

Browse files
tpboudreauwesm
authored andcommitted
Add property showing converted type origin to TimestampLogicalType
1 parent d7b125e commit 458f063

6 files changed

Lines changed: 124 additions & 49 deletions

File tree

cpp/src/arrow/ipc/metadata-internal.cc

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -360,10 +360,9 @@ static Status TypeFromFlatbuffer(const flatbuf::Field* field,
360360
auto type_data = field->type();
361361
if (type_data == nullptr) {
362362
return Status::IOError(
363-
"Type-pointer in custom metadata of flatbuffer-encoded Field is null.");
363+
"Type-pointer in custom metadata of flatbuffer-encoded Field is null.");
364364
}
365-
RETURN_NOT_OK(
366-
ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out));
365+
RETURN_NOT_OK(ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out));
367366

368367
// Look for extension metadata in custom_metadata field
369368
// TODO(wesm): Should this be part of the Field Flatbuffers table?
@@ -766,8 +765,8 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona
766765
auto int_data = encoding->indexType();
767766
if (int_data == nullptr) {
768767
return Status::IOError(
769-
"indexType-pointer in custom metadata of flatbuffer-encoded DictionaryEncoding "
770-
"is null.");
768+
"indexType-pointer in custom metadata of flatbuffer-encoded DictionaryEncoding "
769+
"is null.");
771770
}
772771
RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type));
773772
type = ::arrow::dictionary(index_type, type, encoding->isOrdered());
@@ -1155,7 +1154,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
11551154
auto type_data = tensor->type();
11561155
if (type_data == nullptr) {
11571156
return Status::IOError(
1158-
"Type-pointer in custom metadata of flatbuffer-encoded Tensor is null.");
1157+
"Type-pointer in custom metadata of flatbuffer-encoded Tensor is null.");
11591158
}
11601159
return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
11611160
}
@@ -1204,10 +1203,9 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>
12041203
auto type_data = sparse_tensor->type();
12051204
if (type_data == nullptr) {
12061205
return Status::IOError(
1207-
"Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null.");
1206+
"Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null.");
12081207
}
1209-
return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {},
1210-
type);
1208+
return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type);
12111209
}
12121210

12131211
// ----------------------------------------------------------------------

cpp/src/parquet/arrow/arrow-schema-test.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,14 +116,14 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) {
116116
parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
117117
ParquetType::INT64,
118118
ConvertedType::TIMESTAMP_MILLIS));
119-
arrow_fields.push_back(std::make_shared<Field>(
120-
"timestamp", ::arrow::timestamp(TimeUnit::MILLI, "UTC"), false));
119+
arrow_fields.push_back(
120+
std::make_shared<Field>("timestamp", ::arrow::timestamp(TimeUnit::MILLI), false));
121121

122122
parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]", Repetition::REQUIRED,
123123
ParquetType::INT64,
124124
ConvertedType::TIMESTAMP_MICROS));
125125
arrow_fields.push_back(std::make_shared<Field>(
126-
"timestamp[us]", ::arrow::timestamp(TimeUnit::MICRO, "UTC"), false));
126+
"timestamp[us]", ::arrow::timestamp(TimeUnit::MICRO), false));
127127

128128
parquet_fields.push_back(PrimitiveNode::Make("date", Repetition::REQUIRED,
129129
ParquetType::INT32, ConvertedType::DATE));
@@ -856,15 +856,18 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
856856
LogicalType::Time(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64, -1},
857857
{"timestamp(millisecond)", ::arrow::timestamp(::arrow::TimeUnit::MILLI),
858858
LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS,
859+
/*is_from_converted_type=*/false,
859860
/*force_set_converted_type=*/true),
860861
ParquetType::INT64, -1},
861862
{"timestamp(microsecond)", ::arrow::timestamp(::arrow::TimeUnit::MICRO),
862863
LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS,
864+
/*is_from_converted_type=*/false,
863865
/*force_set_converted_type=*/true),
864866
ParquetType::INT64, -1},
865867
// Parquet v1, values converted to microseconds
866868
{"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO),
867869
LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS,
870+
/*is_from_converted_type=*/false,
868871
/*force_set_converted_type=*/true),
869872
ParquetType::INT64, -1},
870873
{"timestamp(millisecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"),

cpp/src/parquet/arrow/schema.cc

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -134,23 +134,22 @@ static Status MakeArrowTime64(const LogicalType& logical_type,
134134

135135
static Status MakeArrowTimestamp(const LogicalType& logical_type,
136136
std::shared_ptr<ArrowType>* out) {
137-
static const char* utc = "UTC";
138137
const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
138+
const bool utc_normalized =
139+
timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
140+
static const char* utc_timezone = "UTC";
139141
switch (timestamp.time_unit()) {
140142
case LogicalType::TimeUnit::MILLIS:
141-
*out = (timestamp.is_adjusted_to_utc()
142-
? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc)
143-
: ::arrow::timestamp(::arrow::TimeUnit::MILLI));
143+
*out = (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
144+
: ::arrow::timestamp(::arrow::TimeUnit::MILLI));
144145
break;
145146
case LogicalType::TimeUnit::MICROS:
146-
*out = (timestamp.is_adjusted_to_utc()
147-
? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc)
148-
: ::arrow::timestamp(::arrow::TimeUnit::MICRO));
147+
*out = (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
148+
: ::arrow::timestamp(::arrow::TimeUnit::MICRO));
149149
break;
150150
case LogicalType::TimeUnit::NANOS:
151-
*out = (timestamp.is_adjusted_to_utc()
152-
? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc)
153-
: ::arrow::timestamp(::arrow::TimeUnit::NANO));
151+
*out = (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
152+
: ::arrow::timestamp(::arrow::TimeUnit::NANO));
154153
break;
155154
default:
156155
return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
@@ -530,9 +529,11 @@ static std::shared_ptr<const LogicalType> TimestampLogicalTypeFromArrowTimestamp
530529
switch (time_unit) {
531530
case ::arrow::TimeUnit::MILLI:
532531
return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MILLIS,
532+
/*is_from_converted_type=*/false,
533533
/*force_set_converted_type=*/true);
534534
case ::arrow::TimeUnit::MICRO:
535535
return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MICROS,
536+
/*is_from_converted_type=*/false,
536537
/*force_set_converted_type=*/true);
537538
case ::arrow::TimeUnit::NANO:
538539
return LogicalType::Timestamp(utc, LogicalType::TimeUnit::NANOS);

cpp/src/parquet/schema-test.cc

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,28 +1398,34 @@ TEST(TestLogicalTypeOperation, LogicalTypeRepresentation) {
13981398
R"({"Type": "Time", "isAdjustedToUTC": false, "timeUnit": "nanoseconds"})"},
13991399
{LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS),
14001400
"Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, "
1401-
"force_set_converted_type=false)",
1402-
R"({"Type": "Timestamp", "isAdjustedToUTC": true, "timeUnit": "milliseconds", "force_set_converted_type": false})"},
1401+
"is_from_converted_type=false, force_set_converted_type=false)",
1402+
R"({"Type": "Timestamp", "isAdjustedToUTC": true, "timeUnit": "milliseconds", )"
1403+
R"("is_from_converted_type": false, "force_set_converted_type": false})"},
14031404
{LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS),
14041405
"Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, "
1405-
"force_set_converted_type=false)",
1406-
R"({"Type": "Timestamp", "isAdjustedToUTC": true, "timeUnit": "microseconds", "force_set_converted_type": false})"},
1406+
"is_from_converted_type=false, force_set_converted_type=false)",
1407+
R"({"Type": "Timestamp", "isAdjustedToUTC": true, "timeUnit": "microseconds", )"
1408+
R"("is_from_converted_type": false, "force_set_converted_type": false})"},
14071409
{LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS),
14081410
"Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, "
1409-
"force_set_converted_type=false)",
1410-
R"({"Type": "Timestamp", "isAdjustedToUTC": true, "timeUnit": "nanoseconds", "force_set_converted_type": false})"},
1411-
{LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS, true),
1411+
"is_from_converted_type=false, force_set_converted_type=false)",
1412+
R"({"Type": "Timestamp", "isAdjustedToUTC": true, "timeUnit": "nanoseconds", )"
1413+
R"("is_from_converted_type": false, "force_set_converted_type": false})"},
1414+
{LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS, true, true),
14121415
"Timestamp(isAdjustedToUTC=false, timeUnit=milliseconds, "
1413-
"force_set_converted_type=true)",
1414-
R"({"Type": "Timestamp", "isAdjustedToUTC": false, "timeUnit": "milliseconds", "force_set_converted_type": true})"},
1416+
"is_from_converted_type=true, force_set_converted_type=true)",
1417+
R"({"Type": "Timestamp", "isAdjustedToUTC": false, "timeUnit": "milliseconds", )"
1418+
R"("is_from_converted_type": true, "force_set_converted_type": true})"},
14151419
{LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS),
14161420
"Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, "
1417-
"force_set_converted_type=false)",
1418-
R"({"Type": "Timestamp", "isAdjustedToUTC": false, "timeUnit": "microseconds", "force_set_converted_type": false})"},
1421+
"is_from_converted_type=false, force_set_converted_type=false)",
1422+
R"({"Type": "Timestamp", "isAdjustedToUTC": false, "timeUnit": "microseconds", )"
1423+
R"("is_from_converted_type": false, "force_set_converted_type": false})"},
14191424
{LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS),
14201425
"Timestamp(isAdjustedToUTC=false, timeUnit=nanoseconds, "
1421-
"force_set_converted_type=false)",
1422-
R"({"Type": "Timestamp", "isAdjustedToUTC": false, "timeUnit": "nanoseconds", "force_set_converted_type": false})"},
1426+
"is_from_converted_type=false, force_set_converted_type=false)",
1427+
R"({"Type": "Timestamp", "isAdjustedToUTC": false, "timeUnit": "nanoseconds", )"
1428+
R"("is_from_converted_type": false, "force_set_converted_type": false})"},
14231429
{LogicalType::Interval(), "Interval", R"({"Type": "Interval"})"},
14241430
{LogicalType::Int(8, false), "Int(bitWidth=8, isSigned=false)",
14251431
R"({"Type": "Int", "bitWidth": 8, "isSigned": false})"},
@@ -1673,6 +1679,16 @@ struct SchemaElementConstructionArguments {
16731679
std::function<bool()> check_logicalType;
16741680
};
16751681

1682+
struct LegacySchemaElementConstructionArguments {
1683+
std::string name;
1684+
Type::type physical_type;
1685+
int physical_length;
1686+
bool expect_converted_type;
1687+
ConvertedType::type converted_type;
1688+
bool expect_logicalType;
1689+
std::function<bool()> check_logicalType;
1690+
};
1691+
16761692
class TestSchemaElementConstruction : public ::testing::Test {
16771693
public:
16781694
TestSchemaElementConstruction* Reconstruct(
@@ -1692,6 +1708,23 @@ class TestSchemaElementConstruction : public ::testing::Test {
16921708
return this;
16931709
}
16941710

1711+
TestSchemaElementConstruction* LegacyReconstruct(
1712+
const LegacySchemaElementConstructionArguments& c) {
1713+
// Make node, create serializable Thrift object from it ...
1714+
node_ = PrimitiveNode::Make(c.name, Repetition::REQUIRED, c.physical_type,
1715+
c.converted_type, c.physical_length);
1716+
element_.reset(new format::SchemaElement);
1717+
node_->ToParquet(element_.get());
1718+
1719+
// ... then set aside some values for later inspection.
1720+
name_ = c.name;
1721+
expect_converted_type_ = c.expect_converted_type;
1722+
converted_type_ = c.converted_type;
1723+
expect_logicalType_ = c.expect_logicalType;
1724+
check_logicalType_ = c.check_logicalType;
1725+
return this;
1726+
}
1727+
16951728
void Inspect() {
16961729
ASSERT_EQ(element_->name, name_);
16971730
if (expect_converted_type_) {
@@ -1777,6 +1810,17 @@ TEST_F(TestSchemaElementConstruction, SimpleCases) {
17771810
for (const SchemaElementConstructionArguments& c : cases) {
17781811
this->Reconstruct(c)->Inspect();
17791812
}
1813+
1814+
std::vector<LegacySchemaElementConstructionArguments> legacy_cases = {
1815+
{"timestamp_ms", Type::INT64, -1, true, ConvertedType::TIMESTAMP_MILLIS, false,
1816+
check_nothing},
1817+
{"timestamp_us", Type::INT64, -1, true, ConvertedType::TIMESTAMP_MICROS, false,
1818+
check_nothing},
1819+
};
1820+
1821+
for (const LegacySchemaElementConstructionArguments& c : legacy_cases) {
1822+
this->LegacyReconstruct(c)->Inspect();
1823+
}
17801824
}
17811825

17821826
class TestDecimalSchemaElementConstruction : public TestSchemaElementConstruction {
@@ -1920,6 +1964,7 @@ TEST_F(TestTemporalSchemaElementConstruction, TemporalCases) {
19201964
Type::INT64, -1, false, ConvertedType::NA, true, check_TIMESTAMP},
19211965
{"timestamp_F_ms_force",
19221966
LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS,
1967+
/*is_from_converted_type=*/false,
19231968
/*force_set_converted_type=*/true),
19241969
Type::INT64, -1, true, ConvertedType::TIMESTAMP_MILLIS, true, check_TIMESTAMP},
19251970
{"timestamp_T_us", LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS),
@@ -1928,6 +1973,7 @@ TEST_F(TestTemporalSchemaElementConstruction, TemporalCases) {
19281973
Type::INT64, -1, false, ConvertedType::NA, true, check_TIMESTAMP},
19291974
{"timestamp_F_us_force",
19301975
LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS,
1976+
/*is_from_converted_type=*/false,
19311977
/*force_set_converted_type=*/true),
19321978
Type::INT64, -1, true, ConvertedType::TIMESTAMP_MILLIS, true, check_TIMESTAMP},
19331979
{"timestamp_T_ns", LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS),

0 commit comments

Comments
 (0)