Skip to content

Commit a5b3baa

Browse files
committed
Add case for no deleted docs
Signed-off-by: Sandesh Kumar <sandeshkr419@gmail.com>
1 parent ce1082c commit a5b3baa

1 file changed

Lines changed: 86 additions & 13 deletions

File tree

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

Lines changed: 86 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,11 @@
3535
import org.apache.lucene.index.DocValues;
3636
import org.apache.lucene.index.IndexReader;
3737
import org.apache.lucene.index.LeafReaderContext;
38+
import org.apache.lucene.index.NumericDocValues;
3839
import org.apache.lucene.index.SortedDocValues;
3940
import org.apache.lucene.index.SortedSetDocValues;
4041
import org.apache.lucene.index.Terms;
41-
import org.apache.lucene.search.MatchAllDocsQuery;
42+
import org.apache.lucene.index.TermsEnum;
4243
import org.apache.lucene.search.Weight;
4344
import org.apache.lucene.util.ArrayUtil;
4445
import org.apache.lucene.util.BytesRef;
@@ -49,6 +50,7 @@
4950
import org.opensearch.common.util.LongHash;
5051
import org.opensearch.core.common.io.stream.StreamOutput;
5152
import org.opensearch.core.xcontent.XContentBuilder;
53+
import org.opensearch.index.mapper.DocCountFieldMapper;
5254
import org.opensearch.search.DocValueFormat;
5355
import org.opensearch.search.aggregations.AggregationExecutionException;
5456
import org.opensearch.search.aggregations.Aggregator;
@@ -64,7 +66,6 @@
6466
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
6567
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
6668
import org.opensearch.search.aggregations.support.ValuesSource;
67-
import org.opensearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals;
6869
import org.opensearch.search.internal.SearchContext;
6970

7071
import java.io.IOException;
@@ -78,6 +79,7 @@
7879

7980
import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
8081
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
82+
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
8183

8284
/**
8385
* An aggregator of string values that relies on global ordinals in order to build buckets.
@@ -94,6 +96,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
9496
private final LongPredicate acceptedGlobalOrdinals;
9597
private final long valueCount;
9698

99+
private final String fieldName;
100+
97101
private Weight weight;
98102
private final GlobalOrdLookupFunction lookupGlobalOrd;
99103
protected final CollectionStrategy collectionStrategy;
@@ -146,6 +150,7 @@ public GlobalOrdinalsStringTermsAggregator(
146150
return new DenseGlobalOrds();
147151
});
148152
}
153+
this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName();
149154
}
150155

151156
String descriptCollectionStrategy() {
@@ -156,22 +161,80 @@ public void setWeight(Weight weight) {
156161
this.weight = weight;
157162
}
158163

159-
@Override
160-
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
161-
if (weight != null && weight.getQuery() instanceof MatchAllDocsQuery) {
162-
if ((weight.count(ctx) == 0)
163-
&& Terms.getTerms(ctx.reader(), String.valueOf(((WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName()))
164-
.size() == 0) {
165-
return LeafBucketCollector.NO_OP_COLLECTOR;
166-
// } else if (weight.count(ctx) == ctx.reader().maxDoc() && weight.getQuery() instanceof MatchAllDocsQuery) {
167-
// no deleted documents & top level query matches everything
168-
// iterate over the terms - doc frequency for each termsEnum directly
169-
// return appropriate LeafCollector
164+
/**
165+
Collects term frequencies for a given field from a LeafReaderContext.
166+
@param ctx The LeafReaderContext to collect terms from
167+
@param ords The SortedSetDocValues for the field's ordinals
168+
@param ordCountConsumer A consumer to accept collected term frequencies
169+
@return A LeafBucketCollector implementation that throws an exception, since collection is complete
170+
@throws IOException If an I/O error occurs during reading */
171+
LeafBucketCollector termDocFreqCollector(LeafReaderContext ctx, SortedSetDocValues ords, BiConsumer<Long, Integer> ordCountConsumer)
172+
throws IOException {
173+
// long n0 = System.nanoTime(), n1, n2, n3, n4, n5 = 0;
174+
if (weight.count(ctx) != ctx.reader().maxDoc()) {
175+
// Top-level query does not match all docs in this segment.
176+
return null;
177+
}
178+
// n1 = System.nanoTime();
179+
180+
Terms aggTerms = ctx.reader().terms(this.fieldName);
181+
if (aggTerms == null) {
182+
// Field is not indexed.
183+
return null;
184+
}
185+
// n2 = System.nanoTime();
186+
NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
187+
if (docCountValues.nextDoc() != NO_MORE_DOCS) {
188+
// This segment has at least one document with the _doc_count field.
189+
return null;
190+
}
191+
// n3 = System.nanoTime();
192+
TermsEnum indexTermsEnum = aggTerms.iterator();
193+
BytesRef indexTerm = indexTermsEnum.next();
194+
TermsEnum ordinalTermsEnum = ords.termsEnum();
195+
BytesRef ordinalTerm = ordinalTermsEnum.next();
196+
// n4 = System.nanoTime();
197+
while (indexTerm != null && ordinalTerm != null) {
198+
int compare = indexTerm.compareTo(ordinalTerm);
199+
if (compare == 0) {
200+
if (acceptedGlobalOrdinals.test(ordinalTermsEnum.ord())) {
201+
ordCountConsumer.accept(ordinalTermsEnum.ord(), indexTermsEnum.docFreq());
202+
}
203+
indexTerm = indexTermsEnum.next();
204+
ordinalTerm = ordinalTermsEnum.next();
205+
} else if (compare < 0) {
206+
indexTerm = indexTermsEnum.next();
207+
} else {
208+
ordinalTerm = ordinalTermsEnum.next();
170209
}
210+
// n5 = System.nanoTime();
171211
}
212+
// logger.info((n1 - n0) + " " + (n2 - n1) + " " + (n3 - n2) + " " + (n4 - n3) + " " + (n5 - n4));
213+
// return new LeafBucketCollector() {
214+
// @Override
215+
// public void collect(int doc, long owningBucketOrd) {
216+
// throw new CollectionTerminatedException();
217+
// }
218+
// };
219+
return LeafBucketCollector.NO_OP_COLLECTOR;
220+
}
172221

222+
@Override
223+
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
173224
SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx);
174225
collectionStrategy.globalOrdsReady(globalOrds);
226+
227+
if (collectionStrategy instanceof DenseGlobalOrds && sub == LeafBucketCollector.NO_OP_COLLECTOR) {
228+
LeafBucketCollector termDocFreqCollector = termDocFreqCollector(
229+
ctx,
230+
globalOrds,
231+
(o, c) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, o), c)
232+
);
233+
if (termDocFreqCollector != null) {
234+
return termDocFreqCollector;
235+
}
236+
}
237+
175238
SortedDocValues singleValues = DocValues.unwrapSingleton(globalOrds);
176239
if (singleValues != null) {
177240
segmentsWithSingleValuedOrds++;
@@ -369,6 +432,16 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol
369432
final SortedSetDocValues segmentOrds = valuesSource.ordinalsValues(ctx);
370433
segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount());
371434
assert sub == LeafBucketCollector.NO_OP_COLLECTOR;
435+
436+
LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(
437+
ctx,
438+
segmentOrds,
439+
(o, c) -> segmentDocCounts.increment(o + 1, c)
440+
);
441+
if (termDocFreqCollector != null) {
442+
return termDocFreqCollector;
443+
}
444+
372445
final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds);
373446
mapping = valuesSource.globalOrdinalsMapping(ctx);
374447
// Dense mode doesn't support include/exclude so we don't have to check it here.

0 commit comments

Comments
 (0)