Skip to content

Commit b5795db

Browse files
authored
LUCENE-10078: Enable merge-on-refresh by default. (#921)
This gives implementations of `findFullFlushMerges` to `LogMergePolicy` and `TieredMergePolicy` and enables merge-on-refresh with a default timeout of 500ms. The idea behind the 500ms default is that it felt both high-enough to have time to run merges of small segments, and low enough that the freshness of the data wouldn't look badly affected for users who have high refresh rates (e.g. refreshing every second). In both cases, `findFullFlushMerges` delegates to `findMerges` and filters merges whose segments are all below the min/floor size.
1 parent 7e9d5ab commit b5795db

15 files changed

Lines changed: 192 additions & 25 deletions

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ New Features
7676
Improvements
7777
---------------------
7878

79+
* LUCENE-10078: Merge on full flush is now enabled by default with a timeout of
80+
500ms. (Adrien Grand)
81+
7982
* LUCENE-10585: Facet module code cleanup (copy/paste scrubbing, simplification and some very minor
8083
optimization tweaks). (Greg Miller)
8184

lucene/core/src/java/org/apache/lucene/index/FilterMergePolicy.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,4 +124,9 @@ public int numDeletesToMerge(
124124
public MergePolicy unwrap() {
125125
return in;
126126
}
127+
128+
@Override
129+
protected long maxFullFlushMergeSize() {
130+
return in.maxFullFlushMergeSize();
131+
}
127132
}

lucene/core/src/java/org/apache/lucene/index/IndexWriter.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2372,6 +2372,10 @@ public synchronized Set<SegmentCommitInfo> getMergingSegments() {
23722372
* @lucene.experimental
23732373
*/
23742374
private synchronized MergePolicy.OneMerge getNextMerge() {
2375+
if (tragedy.get() != null) {
2376+
throw new IllegalStateException(
2377+
"this writer hit an unrecoverable error; cannot merge", tragedy.get());
2378+
}
23752379
if (pendingMerges.size() == 0) {
23762380
return null;
23772381
} else {
@@ -2388,6 +2392,10 @@ private synchronized MergePolicy.OneMerge getNextMerge() {
23882392
* @lucene.experimental
23892393
*/
23902394
public synchronized boolean hasPendingMerges() {
2395+
if (tragedy.get() != null) {
2396+
throw new IllegalStateException(
2397+
"this writer hit an unrecoverable error; cannot merge", tragedy.get());
2398+
}
23912399
return pendingMerges.size() != 0;
23922400
}
23932401

lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public enum OpenMode {
105105
* Default value for time to wait for merges on commit or getReader (when using a {@link
106106
* MergePolicy} that implements {@link MergePolicy#findFullFlushMerges}).
107107
*/
108-
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 0;
108+
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 500;
109109

110110
// indicates whether this config instance is already attached to a writer.
111111
// not final so that it can be cloned properly.
@@ -457,9 +457,14 @@ public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
457457
* call, like natural segment merges. The default is <code>
458458
* {@value IndexWriterConfig#DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS}</code>.
459459
*
460-
* <p>Note: This settings has no effect unless {@link
461-
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)} has an
462-
* implementation that actually returns merges which by default doesn't return any merges.
460+
* <p>Note: Which segments would get merged depends on the implementation of {@link
461+
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)}
462+
*
463+
* <p>Note: Set to 0 to disable merging on full flush.
464+
*
465+
* <p>Note: If {@link SerialMergeScheduler} is used and a non-zero timout is configured,
466+
* full-flush merges will always wait for the merge to finish without honoring the configured
467+
* timeout.
463468
*/
464469
public IndexWriterConfig setMaxFullFlushMergeWaitMillis(long maxFullFlushMergeWaitMillis) {
465470
this.maxFullFlushMergeWaitMillis = maxFullFlushMergeWaitMillis;

lucene/core/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,10 @@ public double getMaxMergeMBForForcedMerge() {
9696
}
9797

9898
/**
99-
* Sets the minimum size for the lowest level segments. Any segments below this size will be
100-
* merged more aggressively in order to avoid having a long tail of small segments. Large values
101-
* of this parameter increase the merging cost during indexing if you flush small segments.
99+
* Sets the minimum size for the lowest level segments. Any segments below this size are
100+
* candidates for full-flush merges and be merged more aggressively in order to avoid having a
101+
* long tail of small segments. Large values of this parameter increase the merging cost during
102+
* indexing if you flush small segments.
102103
*/
103104
public void setMinMergeMB(double mb) {
104105
minMergeSize = (long) (mb * 1024 * 1024);

lucene/core/src/java/org/apache/lucene/index/LogDocMergePolicy.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IO
4343
}
4444

4545
/**
46-
* Sets the minimum size for the lowest level segments. Any segments below this size will be
47-
* merged more aggressively in order to avoid having a long tail of small segments. Large values
48-
* of this parameter increase the merging cost during indexing if you flush small segments.
46+
* Sets the minimum size for the lowest level segments. Any segments below this size are
47+
* candidates for full-flush merges and merged more aggressively in order to avoid having a long
48+
* tail of small segments. Large values of this parameter increase the merging cost during
49+
* indexing if you flush small segments.
4950
*/
5051
public void setMinMergeDocs(int minMergeDocs) {
5152
minMergeSize = minMergeDocs;

lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
* specifies how a segment's size is determined. {@link LogDocMergePolicy} is one subclass that
3636
* measures size by document count in the segment. {@link LogByteSizeMergePolicy} is another
3737
* subclass that measures size as the total byte size of the file(s) for the segment.
38+
*
39+
* <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link #minMergeSize
40+
* minimum merge size} for {@link #findFullFlushMerges full-flush merges}.
3841
*/
3942
public abstract class LogMergePolicy extends MergePolicy {
4043

@@ -64,7 +67,10 @@ public abstract class LogMergePolicy extends MergePolicy {
6467
/** How many segments to merge at a time. */
6568
protected int mergeFactor = DEFAULT_MERGE_FACTOR;
6669

67-
/** Any segments whose size is smaller than this value will be merged more aggressively. */
70+
/**
71+
* Any segments whose size is smaller than this value will be candidates for full-flush merges and
72+
* merged more aggressively.
73+
*/
6874
protected long minMergeSize;
6975

7076
/** If the size of a segment exceeds this value then it will never be merged. */
@@ -178,6 +184,11 @@ protected boolean isMerged(
178184
&& (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext));
179185
}
180186

187+
@Override
188+
protected long maxFullFlushMergeSize() {
189+
return minMergeSize;
190+
}
191+
181192
/**
182193
* Returns the merges necessary to merge the index, taking the max merge size or max merge docs
183194
* into consideration. This method attempts to respect the {@code maxNumSegments} parameter,

lucene/core/src/java/org/apache/lucene/index/MergePolicy.java

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -601,9 +601,9 @@ public abstract MergeSpecification findForcedDeletesMerges(
601601
SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException;
602602

603603
/**
604-
* Identifies merges that we want to execute (synchronously) on commit. By default, this will do
605-
* no merging on commit. If you implement this method in your {@code MergePolicy} you must also
606-
* set a non-zero timeout using {@link IndexWriterConfig#setMaxFullFlushMergeWaitMillis}.
604+
* Identifies merges that we want to execute (synchronously) on commit. By default, this will
605+
* return {@link #findMerges natural merges} whose segments are all less than the {@link
606+
* #maxFullFlushMergeSize() max segment size for full flushes}.
607607
*
608608
* <p>Any merges returned here will make {@link IndexWriter#commit()}, {@link
609609
* IndexWriter#prepareCommit()} or {@link IndexWriter#getReader(boolean, boolean)} block until the
@@ -628,7 +628,28 @@ public abstract MergeSpecification findForcedDeletesMerges(
628628
public MergeSpecification findFullFlushMerges(
629629
MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext)
630630
throws IOException {
631-
return null;
631+
// This returns natural merges that contain segments below the minimum size
632+
MergeSpecification mergeSpec = findMerges(mergeTrigger, segmentInfos, mergeContext);
633+
if (mergeSpec == null) {
634+
return null;
635+
}
636+
MergeSpecification newMergeSpec = null;
637+
for (OneMerge oneMerge : mergeSpec.merges) {
638+
boolean belowMaxFullFlushSize = true;
639+
for (SegmentCommitInfo sci : oneMerge.segments) {
640+
if (size(sci, mergeContext) >= maxFullFlushMergeSize()) {
641+
belowMaxFullFlushSize = false;
642+
break;
643+
}
644+
}
645+
if (belowMaxFullFlushSize) {
646+
if (newMergeSpec == null) {
647+
newMergeSpec = new MergeSpecification();
648+
}
649+
newMergeSpec.add(oneMerge);
650+
}
651+
}
652+
return newMergeSpec;
632653
}
633654

634655
/**
@@ -671,6 +692,14 @@ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IO
671692
return (info.info.maxDoc() <= 0 ? byteSize : (long) (byteSize * (1.0 - delRatio)));
672693
}
673694

695+
/**
696+
* Return the maximum size of segments to be included in full-flush merges by the default
697+
* implementation of {@link #findFullFlushMerges}.
698+
*/
699+
protected long maxFullFlushMergeSize() {
700+
return 0L;
701+
}
702+
674703
/** Asserts that the delCount for this SegmentCommitInfo is valid */
675704
protected final boolean assertDelCount(int delCount, SegmentCommitInfo info) {
676705
assert delCount >= 0 : "delCount must be positive: " + delCount;

lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@
6262
*
6363
* <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
6464
*
65+
* <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link
66+
* #setFloorSegmentMB(double) floor segment size} for {@link #findFullFlushMerges full-flush
67+
* merges}.
68+
*
6569
* @lucene.experimental
6670
*/
6771

@@ -168,9 +172,16 @@ public double getDeletesPctAllowed() {
168172
}
169173

170174
/**
171-
* Segments smaller than this are "rounded up" to this size, ie treated as equal (floor) size for
172-
* merge selection. This is to prevent frequent flushing of tiny segments from allowing a long
173-
* tail in the index. Default is 2 MB.
175+
* Segments smaller than this size are merged more aggressively:
176+
*
177+
* <ul>
178+
* <li>They are candidates for full-flush merges, in order to reduce the number of segments in
179+
* the index prior to opening a new point-in-time view of the index.
180+
* <li>For background merges, smaller segments are "rounded up" to this size.
181+
* </ul>
182+
*
183+
* In both cases, this helps prevent frequent flushing of tiny segments to create a long tail of
184+
* small segments in the index. Default is 2MB.
174185
*/
175186
public TieredMergePolicy setFloorSegmentMB(double v) {
176187
if (v <= 0.0) {
@@ -190,6 +201,11 @@ public double getFloorSegmentMB() {
190201
return floorSegmentBytes / (1024 * 1024.);
191202
}
192203

204+
@Override
205+
protected long maxFullFlushMergeSize() {
206+
return floorSegmentBytes;
207+
}
208+
193209
/**
194210
* When forceMergeDeletes is called, we only merge away a segment if its delete percentage is over
195211
* this threshold. Default is 10%.

lucene/core/src/test/org/apache/lucene/index/TestDirectoryReaderReopen.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -815,7 +815,8 @@ public void testNPEAfterInvalidReindex2() throws Exception {
815815
/** test reopening backwards from a non-NRT reader (with document deletes) */
816816
public void testNRTMdeletes() throws Exception {
817817
Directory dir = newDirectory();
818-
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
818+
IndexWriterConfig iwc =
819+
new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
819820
SnapshotDeletionPolicy snapshotter =
820821
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
821822
iwc.setIndexDeletionPolicy(snapshotter);
@@ -865,7 +866,8 @@ public void testNRTMdeletes() throws Exception {
865866
/** test reopening backwards from an NRT reader (with document deletes) */
866867
public void testNRTMdeletes2() throws Exception {
867868
Directory dir = newDirectory();
868-
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
869+
IndexWriterConfig iwc =
870+
new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
869871
SnapshotDeletionPolicy snapshotter =
870872
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
871873
iwc.setIndexDeletionPolicy(snapshotter);

0 commit comments

Comments
 (0)