@@ -283,7 +283,7 @@ std::optional<Int32> IcebergMetadata::getSchemaVersionByFileIfOutdated(String da
283283 auto manifest_file_it = manifest_file_by_data_file.find (data_path);
284284 if (manifest_file_it == manifest_file_by_data_file.end ())
285285 {
286- throw Exception (ErrorCodes::BAD_ARGUMENTS, " Cannot find schema version for data file: {}" , data_path);
286+ throw Exception (ErrorCodes::BAD_ARGUMENTS, " Cannot find manifest file for data file: {}" , data_path);
287287 }
288288 const ManifestFileContent & manifest_file = *manifest_file_it->second ;
289289 auto schema_id = manifest_file.getSchemaId ();
@@ -335,6 +335,30 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co
335335 auto manifest_list_file_reader
336336 = std::make_unique<avro::DataFileReaderBase>(std::make_unique<AvroInputStreamReadBufferAdapter>(*manifest_list_buf));
337337
338+
339+ LOG_DEBUG (&Poco::Logger::get (" IcebergMetadata, ManifestList" ), " dataSchema: {}" , manifest_list_file_reader->dataSchema ().toJson (true ));
340+
341+ std::stringstream data_schema_root_ss;
342+ manifest_list_file_reader->dataSchema ().root ()->printJson (data_schema_root_ss, 10 );
343+ LOG_DEBUG (&Poco::Logger::get (" IcebergMetadata, ManifestList" ), " dataSchema root: {}" , data_schema_root_ss.str ());
344+
345+ LOG_DEBUG (
346+ &Poco::Logger::get (" IcebergMetadata, ManifestList" ),
347+ " dataSchema root type: {}" ,
348+ manifest_list_file_reader->dataSchema ().root ()->type ());
349+
350+ for (size_t i = 0 ; i < manifest_list_file_reader->dataSchema ().root ()->leaves (); ++i)
351+ {
352+ const auto & field = manifest_list_file_reader->dataSchema ().root ()->leafAt (static_cast <int >(i));
353+
354+ const auto & field_name = manifest_list_file_reader->dataSchema ().root ()->nameAt (static_cast <int >(i));
355+
356+ std::stringstream ss;
357+ field->printJson (ss, 10 );
358+ LOG_DEBUG (&Poco::Logger::get (" IcebergMetadata, ManifestList" ), " field: {}" , ss.str ());
359+ LOG_DEBUG (&Poco::Logger::get (" IcebergMetadata, ManifestList" ), " field name: {}" , field_name);
360+ }
361+
338362 auto [name_to_index, name_to_data_type, header] = getColumnsAndTypesFromAvroByNames (
339363 manifest_list_file_reader->dataSchema ().root (),
340364 {" manifest_path" , " sequence_number" },
@@ -384,9 +408,16 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co
384408 {
385409 added_sequence_number = sequence_number_column.value ()->getInt (i);
386410 }
387- auto [manifest_file_iterator, _inserted]
388- = manifest_files_by_name.emplace (current_filename, initializeManifestFile (current_filename, added_sequence_number));
389- manifest_list.push_back (ManifestListFileEntry{ManifestFileIterator{manifest_file_iterator}, added_sequence_number});
411+ /// We can't encapsulate this logic in getManifestFile because we need not only the name of the file but also the inherited sequence number, which is known only while the ManifestList is being parsed.
412+ auto manifest_file_content = initializeManifestFile (current_filename, added_sequence_number);
413+ auto [iterator, _inserted] = manifest_files_by_name.emplace (current_filename, std::move (manifest_file_content));
414+ auto manifest_file_iterator = ManifestFileIterator{iterator};
415+ for (const auto & data_file_path : manifest_file_iterator->getFiles ())
416+ {
417+ if (std::holds_alternative<DataFileEntry>(data_file_path.file ))
418+ manifest_file_by_data_file.emplace (std::get<DataFileEntry>(data_file_path.file ).file_name , manifest_file_iterator);
419+ }
420+ manifest_list.push_back (ManifestListFileEntry{manifest_file_iterator, added_sequence_number});
390421 }
391422
392423 return manifest_list;
@@ -421,6 +452,14 @@ ManifestFileIterator IcebergMetadata::getManifestFile(const String & filename) c
421452 throw Exception (ErrorCodes::BAD_ARGUMENTS, " Cannot find manifest file: {}" , filename);
422453}
423454
455+ std::optional<ManifestFileIterator> IcebergMetadata::tryGetManifestFile (const String & filename) const
456+ {
457+ auto manifest_file_it = manifest_files_by_name.find (filename);
458+ if (manifest_file_it != manifest_files_by_name.end ())
459+ return ManifestFileIterator{manifest_file_it};
460+ return std::nullopt ;
461+ }
462+
424463ManifestListIterator IcebergMetadata::getManifestList (const String & filename) const
425464{
426465 auto manifest_file_it = manifest_lists_by_name.find (filename);
@@ -464,10 +503,10 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const
464503 return cached_unprunned_files_for_current_snapshot.value ();
465504
466505 Strings data_files;
467- for (const auto & manifest_entry : *current_snapshot->manifest_list_iterator )
506+ for (const auto & manifest_list_entry : *( current_snapshot->manifest_list_iterator ) )
468507 {
469508 const auto & partition_columns_ids
470- = getRelevantPartitionColumnIds (manifest_entry .manifest_file , schema_processor, current_schema_id);
509+ = getRelevantPartitionColumnIds (manifest_list_entry .manifest_file , schema_processor, current_schema_id);
471510 const auto & partition_pruning_columns_names_and_types
472511 = schema_processor.tryGetFieldsCharacteristics (current_schema_id, partition_columns_ids);
473512
@@ -476,7 +515,7 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const
476515 const KeyCondition partition_key_condition (
477516 filter_dag, getContext (), partition_pruning_columns_names_and_types.getNames (), partition_minmax_idx_expr);
478517
479- const auto & data_files_in_manifest = manifest_entry .manifest_file ->getFiles ();
518+ const auto & data_files_in_manifest = manifest_list_entry .manifest_file ->getFiles ();
480519 for (const auto & manifest_file_entry : data_files_in_manifest)
481520 {
482521 if (manifest_file_entry.status != ManifestEntryStatus::DELETED)
0 commit comments