Merge branch 'antalya-25.8' into backports/antalya-25.8/87687

mkmkme · web-flow · commit 14076d5fa31b · 2026-01-22T16:22:27.000+01:00
diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp
@@ -127,7 +127,7 @@ NamesAndTypesList SchemaConverter::inferSchema()
     return res;
 }
 
-std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElement & element) const
+std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElement & element, const String & current_path) const
 {
     if (!column_mapper)
         return element.name;
@@ -142,8 +142,19 @@ std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElem
     auto it = map.find(element.field_id);
     if (it == map.end())
         throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Parquet file has column {} with field_id {} that is not in datalake metadata", element.name, element.field_id);
-    auto split = Nested::splitName(std::string_view(it->second), /*reverse=*/ true);
-    return split.second.empty() ? split.first : split.second;
+
+    /// At top level (empty path), return the full mapped name. For nested
+    /// elements, strip the parent path prefix to get the child name.
+    if (current_path.empty())
+        return it->second;
+
+    /// Strip "current_path." prefix to get child name (preserves dots in child names)
+    std::string_view mapped = it->second;
+    if (mapped.starts_with(current_path) && mapped.size() > current_path.size()
+        && mapped[current_path.size()] == '.')
+        return mapped.substr(current_path.size() + 1);
+
+    return mapped;
 }
 
 void SchemaConverter::processSubtree(TraversalNode & node)
@@ -160,7 +171,7 @@ void SchemaConverter::processSubtree(TraversalNode & node)
 
     if (node.schema_context == SchemaContext::None)
     {
-        node.appendNameComponent(node.element->name, useColumnMapperIfNeeded(*node.element));
+        node.appendNameComponent(node.element->name, useColumnMapperIfNeeded(*node.element, node.name));
 
         if (sample_block)
         {
@@ -589,7 +600,7 @@ void SchemaConverter::processSubtreeTuple(TraversalNode & node)
     std::vector<String> element_names_in_file;
     for (size_t i = 0; i < size_t(node.element->num_children); ++i)
     {
-        const String & element_name = element_names_in_file.emplace_back(useColumnMapperIfNeeded(file_metadata.schema.at(schema_idx)));
+        const String & element_name = element_names_in_file.emplace_back(useColumnMapperIfNeeded(file_metadata.schema.at(schema_idx), node.name));
         std::optional<size_t> idx_in_output_tuple = i - skipped_unsupported_columns;
         if (lookup_by_name)
         {
diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.h b/src/Processors/Formats/Impl/Parquet/SchemaConverter.h
@@ -137,8 +137,10 @@ struct SchemaConverter
         DataTypePtr & out_inferred_type, std::optional<GeoColumnMetadata> geo_metadata) const;
 
     /// Returns element.name or a corresponding name from ColumnMapper.
-    /// For tuple elements, that's just the element name like `x`, not the whole path like `t.x`.
-    std::string_view useColumnMapperIfNeeded(const parq::SchemaElement & element) const;
+    /// For nested tuple elements, returns just the element name like `x`, not the whole path like `t.x`.
+    /// For top-level columns (when current_path is empty), returns the full mapped name to support
+    /// column names with dots (e.g., `integer.col` in Iceberg).
+    std::string_view useColumnMapperIfNeeded(const parq::SchemaElement & element, const String & current_path) const;
 };
 
 }
diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp
@@ -469,6 +469,15 @@ NamesAndTypesList ColumnsDescription::getInsertable() const
     return ret;
 }
 
+NamesAndTypesList ColumnsDescription::getReadable() const
+{
+    NamesAndTypesList ret;
+    for (const auto & col : columns)
+        if (col.default_desc.kind != ColumnDefaultKind::Ephemeral)
+            ret.emplace_back(col.name, col.type);
+    return ret;
+}
+
 NamesAndTypesList ColumnsDescription::getMaterialized() const
 {
     NamesAndTypesList ret;
@@ -851,7 +860,6 @@ std::optional<ColumnDefault> ColumnsDescription::getDefault(const String & colum
     return {};
 }
 
-
 bool ColumnsDescription::hasCompressionCodec(const String & column_name) const
 {
     const auto it = columns.get<1>().find(column_name);
diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h
@@ -149,6 +149,7 @@ class ColumnsDescription : public IHints<>
     NamesAndTypesList getOrdinary() const;
     NamesAndTypesList getMaterialized() const;
     NamesAndTypesList getInsertable() const; /// ordinary + ephemeral
+    NamesAndTypesList getReadable() const; /// ordinary + materialized + aliases (no ephemeral)
     NamesAndTypesList getAliases() const;
     NamesAndTypesList getEphemeral() const;
     NamesAndTypesList getAllPhysical() const; /// ordinary + materialized.
diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp
@@ -4,17 +4,20 @@
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/DatabaseCatalog.h>
+#include <Interpreters/inplaceBlockConversions.h>
 #include <Core/Settings.h>
 #include <Interpreters/ExpressionActions.h>
 #include <Processors/Executors/CompletedPipelineExecutor.h>
 #include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
 #include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
 #include <Processors/QueryPlan/QueryPlan.h>
+#include <Processors/QueryPlan/ExpressionStep.h>
 #include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Common/Exception.h>
 #include <Common/ProfileEventsScope.h>
 #include <Storages/MergeTree/ExportList.h>
 #include <Formats/FormatFactory.h>
+#include <Databases/enableAllExperimentalSettings.h>
 
 namespace ProfileEvents
 {
@@ -42,6 +45,43 @@ namespace Setting
     extern const SettingsUInt64 export_merge_tree_part_max_rows_per_file;
 }
 
+namespace
+{
+    void materializeSpecialColumns(
+        const SharedHeader & header,
+        const StorageMetadataPtr & storage_metadata,
+        const ContextPtr & local_context,
+        QueryPlan & plan_for_part
+    )
+    {
+        const auto readable_columns = storage_metadata->getColumns().getReadable();
+
+        // Enable all experimental settings for default expressions
+        // (same pattern as in IMergeTreeReader::evaluateMissingDefaults)
+        auto context_for_defaults = Context::createCopy(local_context);
+        enableAllExperimentalSettings(context_for_defaults);
+        
+        auto defaults_dag = evaluateMissingDefaults(
+            *header,
+            readable_columns,
+            storage_metadata->getColumns(),
+            context_for_defaults);
+
+        if (defaults_dag)
+        {
+            /// Ensure columns are in the correct order matching readable_columns
+            defaults_dag->removeUnusedActions(readable_columns.getNames(), false);
+            defaults_dag->addMaterializingOutputActions(/*materialize_sparse=*/ false);
+            
+            auto expression_step = std::make_unique<ExpressionStep>(
+                header,
+                std::move(*defaults_dag));
+            expression_step->setStepDescription("Compute alias and default expressions for export");
+            plan_for_part.addStep(std::move(expression_step));
+        }
+    }
+}
+
 ExportPartTask::ExportPartTask(MergeTreeData & storage_, const MergeTreePartExportManifest & manifest_)
     : storage(storage_),
     manifest(manifest_)
@@ -58,7 +98,8 @@ bool ExportPartTask::executeStep()
 
     const auto & metadata_snapshot = manifest.metadata_snapshot;
 
-    Names columns_to_read = metadata_snapshot->getColumns().getNamesOfPhysical();
+    /// Read only physical columns from the part
+    const auto columns_to_read = metadata_snapshot->getColumns().getNamesOfPhysical();
 
     MergeTreeSequentialSourceType read_type = MergeTreeSequentialSourceType::Export;
 
@@ -146,6 +187,10 @@ bool ExportPartTask::executeStep()
             local_context,
             getLogger("ExportPartition"));
 
+        /// Object storage engines do not support MATERIALIZED and ALIAS columns, but exported data must still contain their values.
+        /// As a workaround, compute these columns before the export so they can be written to destination tables that declare matching columns.
+        materializeSpecialColumns(plan_for_part.getCurrentHeader(), metadata_snapshot, local_context, plan_for_part);
+
         ThreadGroupSwitcher switcher((*exports_list_entry)->thread_group, "");
 
         QueryPlanOptimizationSettings optimization_settings(local_context);
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6242,7 +6242,13 @@ void MergeTreeData::exportPartToTable(
     auto source_metadata_ptr = getInMemoryMetadataPtr();
     auto destination_metadata_ptr = dest_storage->getInMemoryMetadataPtr();
 
-    if (destination_metadata_ptr->getColumns().getAllPhysical().sizeOfDifference(source_metadata_ptr->getColumns().getAllPhysical()))
+    const auto & source_columns = source_metadata_ptr->getColumns();
+
+    const auto & destination_columns = destination_metadata_ptr->getColumns();
+
+    /// Compare the source's readable columns (everything except EPHEMERAL) with the destination's insertable columns.
+    /// This way EPHEMERAL source columns are not required to exist in the destination.
+    if (source_columns.getReadable().sizeOfDifference(destination_columns.getInsertable()))
         throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure");
 
     if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST()))
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp
@@ -43,6 +43,7 @@ IcebergDataObjectInfo::IcebergDataObjectInfo(Iceberg::ManifestFileEntry data_man
                        data_manifest_file_entry_.file_path_key.empty() ? std::nullopt : std::make_optional(data_manifest_file_entry_.file_path_key))
     , data_object_file_path_key(data_manifest_file_entry_.file_path_key)
     , underlying_format_read_schema_id(data_manifest_file_entry_.schema_id)
+    , file_format(data_manifest_file_entry_.file_format)
     , sequence_number(data_manifest_file_entry_.added_sequence_number)
 {
     if (!position_deletes_objects.empty() && Poco::toUpperInPlace(data_manifest_file_entry_.file_format) != "PARQUET")
@@ -59,10 +60,11 @@ IcebergDataObjectInfo::IcebergDataObjectInfo(
     ObjectStoragePtr resolved_storage,
     const String & resolved_key)
     : PathWithMetadata(resolved_key, std::nullopt,
-                       data_manifest_file_entry_.file_path.empty() ? std::nullopt : std::make_optional(data_manifest_file_entry_.file_path), 
+                       data_manifest_file_entry_.file_path.empty() ? std::nullopt : std::make_optional(data_manifest_file_entry_.file_path),
                        resolved_storage)
     , data_object_file_path_key(data_manifest_file_entry_.file_path_key)
     , underlying_format_read_schema_id(data_manifest_file_entry_.schema_id)
+    , file_format(data_manifest_file_entry_.file_format)
     , sequence_number(data_manifest_file_entry_.added_sequence_number)
 {
     if (!position_deletes_objects.empty() && Poco::toUpperInPlace(data_manifest_file_entry_.file_format) != "PARQUET")
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h
@@ -22,7 +22,7 @@ struct IcebergDataObjectInfo : public PathWithMetadata, std::enable_shared_from_
     /// It is used to filter position deletes objects by data file path.
     /// It is also used to create a filter for the data object in the position delete transform.
     explicit IcebergDataObjectInfo(Iceberg::ManifestFileEntry data_manifest_file_entry_);
-    
+
     /// Sometimes data files are located outside the table location and even in a different storage.
     explicit IcebergDataObjectInfo(
         Iceberg::ManifestFileEntry data_manifest_file_entry_,
@@ -50,6 +50,7 @@ struct IcebergDataObjectInfo : public PathWithMetadata, std::enable_shared_from_
 
     String data_object_file_path_key; // Full path to the data object file
     Int32 underlying_format_read_schema_id;
+    String file_format; // Format of the data file (e.g., "PARQUET", "ORC", "AVRO")
     std::vector<Iceberg::PositionDeleteObject> position_deletes_objects;
     std::vector<Iceberg::ManifestFileEntry> equality_deletes_objects;
     Int64 sequence_number;
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
@@ -1077,7 +1077,7 @@ void IcebergMetadata::addDeleteTransformers(
 
             auto [delete_storage_to_use, resolved_delete_key] = resolveObjectStorageForPath(
                 persistent_components.table_location, delete_file.file_path, object_storage, *secondary_storages, local_context);
-            
+
             PathWithMetadata delete_file_object(resolved_delete_key, std::nullopt, delete_file.file_path, delete_storage_to_use);
             {
                 auto schema_read_buffer = createReadBuffer(delete_file_object, delete_storage_to_use, local_context, log);
@@ -1198,8 +1198,7 @@ ColumnMapperPtr IcebergMetadata::getColumnMapperForObject(ObjectInfoPtr object_i
     IcebergDataObjectInfo * iceberg_object_info = dynamic_cast<IcebergDataObjectInfo *>(object_info.get());
     if (!iceberg_object_info)
         return nullptr;
-    auto configuration_ptr = configuration.lock();
-    if (Poco::toLower(configuration_ptr->getFormat()) != "parquet")
+    if (Poco::toLower(iceberg_object_info->file_format) != "parquet")
         return nullptr;
 
     return persistent_components.schema_processor->getColumnMapperById(iceberg_object_info->underlying_format_read_schema_id);
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -8133,7 +8133,9 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
     auto src_snapshot = getInMemoryMetadataPtr();
     auto destination_snapshot = dest_storage->getInMemoryMetadataPtr();
 
-    if (destination_snapshot->getColumns().getAllPhysical().sizeOfDifference(src_snapshot->getColumns().getAllPhysical()))
+    /// Compare the source's readable columns (everything except EPHEMERAL) with the destination's insertable columns.
+    /// This way EPHEMERAL source columns are not required to exist in the destination.
+    if (src_snapshot->getColumns().getReadable().sizeOfDifference(destination_snapshot->getColumns().getInsertable()))
         throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure");
 
     if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST()))
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -747,3 +747,59 @@ def test_multiple_exports_within_a_single_query(cluster):
 #     # Wait for export to finish and then verify destination still reflects the original snapshot (3 rows)
 #     time.sleep(5)
 #     assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") == '3\n', "Export did not preserve snapshot at start time after source mutation"
+
+
+def test_export_partition_with_mixed_computed_columns(cluster):
+    """Test export partition with ALIAS, MATERIALIZED, and EPHEMERAL columns."""
+    node = cluster.instances["replica1"]
+
+    mt_table = "mixed_computed_mt_table"
+    s3_table = "mixed_computed_s3_table"
+
+    node.query(f"""
+        CREATE TABLE {mt_table} (
+            id UInt32,
+            value UInt32,
+            tag_input String EPHEMERAL,
+            doubled UInt64 ALIAS value * 2,
+            tripled UInt64 MATERIALIZED value * 3,
+            tag String DEFAULT upper(tag_input)
+        ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{mt_table}', 'replica1')
+        PARTITION BY id
+        ORDER BY id
+        SETTINGS index_granularity = 1
+    """)
+
+    # Create S3 destination table with regular columns (no EPHEMERAL)
+    node.query(f"""
+        CREATE TABLE {s3_table} (
+            id UInt32,
+            value UInt32,
+            doubled UInt64,
+            tripled UInt64,
+            tag String
+        ) ENGINE = S3(s3_conn, filename='{s3_table}', format=Parquet, partition_strategy='hive')
+        PARTITION BY id
+    """)
+
+    node.query(f"INSERT INTO {mt_table} (id, value, tag_input) VALUES (1, 5, 'test'), (1, 10, 'prod')")
+
+    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '1' TO TABLE {s3_table}")
+
+    wait_for_export_status(node, mt_table, s3_table, "1", "COMPLETED")
+
+    # Verify source data (ALIAS computed, EPHEMERAL not stored)
+    source_result = node.query(f"SELECT id, value, doubled, tripled, tag FROM {mt_table} ORDER BY value")
+    expected = "1\t5\t10\t15\tTEST\n1\t10\t20\t30\tPROD\n"
+    assert source_result == expected, f"Source table data mismatch. Expected:\n{expected}\nGot:\n{source_result}"
+
+    dest_result = node.query(f"SELECT id, value, doubled, tripled, tag FROM {s3_table} ORDER BY value")
+    assert dest_result == expected, f"Exported data mismatch. Expected:\n{expected}\nGot:\n{dest_result}"
+
+    status = node.query(f"""
+        SELECT status FROM system.replicated_partition_exports
+        WHERE source_table = '{mt_table}'
+            AND destination_table = '{s3_table}'
+            AND partition_id = '1'
+    """)
+    assert status.strip() == "COMPLETED", f"Expected COMPLETED status, got: {status}"
diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.reference b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.reference
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql

Original file line number	Diff line number	Diff line change
`@@ -469,6 +469,15 @@ NamesAndTypesList ColumnsDescription::getInsertable() const`
`469`	`469`	`    return ret;`
`470`	`470`	`}`
`471`	`471`
	`472`	`+NamesAndTypesList ColumnsDescription::getReadable() const`
	`473`	`+{`
	`474`	`+    NamesAndTypesList ret;`
	`475`	`+    for (const auto & col : columns)`
	`476`	`+        if (col.default_desc.kind != ColumnDefaultKind::Ephemeral)`
	`477`	`+            ret.emplace_back(col.name, col.type);`
	`478`	`+    return ret;`
	`479`	`+}`
	`480`	`+`
`472`	`481`	`NamesAndTypesList ColumnsDescription::getMaterialized() const`
`473`	`482`	`{`
`474`	`483`	`    NamesAndTypesList ret;`
`@@ -851,7 +860,6 @@ std::optional<ColumnDefault> ColumnsDescription::getDefault(const String & colum`
`851`	`860`	`    return {};`
`852`	`861`	`}`
`853`	`862`
`854`		`-`
`855`	`863`	`bool ColumnsDescription::hasCompressionCodec(const String & column_name) const`
`856`	`864`	`{`
`857`	`865`	`    const auto it = columns.get<1>().find(column_name);`