@@ -596,7 +596,13 @@ public void cloneShardSnapshot(
             existingSnapshots = tuple.v1();
         } else {
             newGen = ShardGeneration.newGeneration();
-            existingSnapshots = buildBlobStoreIndexShardSnapshots(index, Collections.emptySet(), shardContainer, shardGeneration).v1();
+            existingSnapshots = buildBlobStoreIndexShardSnapshots(
+                index,
+                shardNum,
+                Collections.emptySet(),
+                shardContainer,
+                shardGeneration
+            ).v1();
             existingShardGen = shardGeneration;
         }
         SnapshotFiles existingTargetFiles = null;
@@ -1309,6 +1315,7 @@ protected void doRun() throws Exception {
                     newGen = -1L;
                     blobStoreIndexShardSnapshots = buildBlobStoreIndexShardSnapshots(
                         indexId,
+                        shardId,
                         originalShardBlobs,
                         shardContainer,
                         originalRepositoryData.shardGenerations().getShardGen(indexId, shardId)
@@ -3203,6 +3210,7 @@ private void doSnapshotShard(SnapshotShardContext context) {
             snapshotStatus.ensureNotAborted();
             Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> tuple = buildBlobStoreIndexShardSnapshots(
                 context.indexId(),
+                shardId.id(),
                 blobs,
                 shardContainer,
                 generation
@@ -3848,14 +3856,15 @@ public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId inde
             blobs = shardContainer.listBlobsByPrefix(OperationPurpose.SNAPSHOT_METADATA, SNAPSHOT_INDEX_PREFIX).keySet();
         }

-        return buildBlobStoreIndexShardSnapshots(indexId, blobs, shardContainer, shardGen).v1();
+        return buildBlobStoreIndexShardSnapshots(indexId, shardId, blobs, shardContainer, shardGen).v1();
     }

     /**
      * Loads all available snapshots in the repository using the given {@code generation} or falling back to trying to determine it from
      * the given list of blobs in the shard container.
      *
      * @param indexId    {@link IndexId} identifying the corresponding index
+     * @param shardId    the 0-based shard id, see also {@link ShardId#id()}
      * @param blobs      list of blobs in repository
      * @param generation shard generation or {@code null} in case there was no shard generation tracked in the {@link RepositoryData} for
      *                   this shard because its snapshot was created in a version older than
@@ -3864,6 +3873,7 @@ public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId inde
      */
     private Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> buildBlobStoreIndexShardSnapshots(
         IndexId indexId,
+        int shardId,
         Set<String> blobs,
         BlobContainer shardContainer,
         @Nullable ShardGeneration generation
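The new parameter uses the 0-based shard numbering found elsewhere in the codebase: it is the value returned by ShardId#id() and the key ShardGenerations uses to look up per-shard generations (see the getShardGen call in the hunks above). A minimal illustration; the index name and UUID are made up, and repositoryData/indexId are assumed to be in scope as at the call sites above:

    ShardId shard = new ShardId("my-index", "_na_", 0);
    int shardNum = shard.id(); // 0-based shard number, i.e. the new shardId argument
    ShardGeneration gen = repositoryData.shardGenerations().getShardGen(indexId, shardNum);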
@@ -3883,6 +3893,21 @@ private Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> buildBlobStoreIndex
                     generation
                 );
             } catch (NoSuchFileException noSuchFileException) {
+                // The master may have concurrently mutated the shard generation. This can happen when the master
+                // fails over, which is "expected", and in that case we do not need to apply the workaround below
+                // for the missing file.
+                final RepositoryData currentRepositoryData;
+                try {
+                    final long latestGeneration = latestIndexBlobId();
+                    currentRepositoryData = getRepositoryData(latestGeneration);
+                } catch (Exception e) {
+                    noSuchFileException.addSuppressed(e);
+                    throw noSuchFileException;
+                }
+                final ShardGeneration latestShardGen = currentRepositoryData.shardGenerations().getShardGen(indexId, shardId);
+                if (latestShardGen == null || latestShardGen.equals(generation) == false) {
+                    throw noSuchFileException;
+                }
+
                 // This shouldn't happen (absent an external force deleting blobs from the repo) but in practice we've found bugs in the way
                 // we manipulate shard generation UUIDs under concurrent snapshot load which can lead to incorrectly deleting a referenced
                 // shard-level `index-UUID` blob during finalization. We definitely want to treat this as a test failure (see the `assert`
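The added guard can be read as a general pattern: before applying a repair path for a missing blob, re-read the latest repository metadata and only proceed if the generation that failed to load is still current; otherwise the miss is explained by a concurrent mutation (e.g. a master failover) and the original exception should propagate. A self-contained sketch of that pattern under those assumptions; Repo, loadShardSnapshots, latestShardGenFor and repairMissingBlob are hypothetical stand-ins for the repository internals, not the actual BlobStoreRepository API:

    import java.nio.file.NoSuchFileException;
    import java.util.Objects;

    final class StaleGenerationGuard {

        /** Hypothetical repository abstraction standing in for BlobStoreRepository internals. */
        interface Repo {
            String loadShardSnapshots(String shardGen) throws NoSuchFileException;

            String latestShardGenFor(String indexId, int shardId) throws Exception;
        }

        static String load(Repo repo, String indexId, int shardId, String shardGen) throws NoSuchFileException {
            try {
                return repo.loadShardSnapshots(shardGen);
            } catch (NoSuchFileException missing) {
                final String latestShardGen;
                try {
                    // Re-read the latest repository metadata to see whether the shard
                    // generation moved on while we were reading.
                    latestShardGen = repo.latestShardGenFor(indexId, shardId);
                } catch (Exception e) {
                    missing.addSuppressed(e);
                    throw missing; // cannot tell either way; propagate the original failure
                }
                if (Objects.equals(latestShardGen, shardGen) == false) {
                    // Concurrent mutation (e.g. master failover): the blob is stale, not lost.
                    throw missing;
                }
                // Still the latest generation, so the referenced blob really is gone;
                // fall through to the workaround described in the comments above.
                return repairMissingBlob(indexId, shardId);
            }
        }

        private static String repairMissingBlob(String indexId, int shardId) {
            return ""; // placeholder for the fallback/workaround path
        }
    }

Note the ordering: the freshness check happens first precisely so that the expensive (and assertion-triggering) fallback only runs for genuinely missing blobs, not for reads that simply lost a race with the master.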