Skip to content

Commit a03e38d

Browse files
committed
LUCENE-4955: Fix NGramTokenizer and NGramTokenFilter, and remove them from TestRandomChains' exclusion list.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1476135 13f79535-47bb-0310-9956-ffa450edef68
1 parent 7cfcb26 commit a03e38d

10 files changed

Lines changed: 465 additions & 135 deletions

File tree

lucene/CHANGES.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ Optimizations
3737

3838
======================= Lucene 4.4.0 =======================
3939

40+
Changes in backwards compatibility policy
41+
42+
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
43+
same position and preserves the position length and the offsets of the
44+
original token. (Simon Willnauer, Adrien Grand)
45+
46+
* LUCENE-4955: NGramTokenizer now emits n-grams in a different order
47+
(a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
48+
whitespace. (Adrien Grand)
49+
4050
Bug Fixes
4151

4252
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
@@ -46,6 +56,9 @@ Bug Fixes
4656
if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with
4757
large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir)
4858

59+
* LUCENE-4955: NGramTokenizer now supports inputs larger than 1024 chars.
60+
(Adrien Grand)
61+
4962
Optimizations
5063

5164
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
package org.apache.lucene.analysis.ngram;
2+
3+
/*
4+
* Licensed to the Apache Software Foundation (ASF) under one or more
5+
* contributor license agreements. See the NOTICE file distributed with
6+
* this work for additional information regarding copyright ownership.
7+
* The ASF licenses this file to You under the Apache License, Version 2.0
8+
* (the "License"); you may not use this file except in compliance with
9+
* the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
import java.io.IOException;
21+
import java.io.Reader;
22+
23+
import org.apache.lucene.analysis.Tokenizer;
24+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
25+
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
26+
27+
/**
28+
* Old broken version of {@link NGramTokenizer}.
29+
*/
30+
@Deprecated
31+
public final class Lucene43NGramTokenizer extends Tokenizer {
32+
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
33+
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
34+
35+
private int minGram, maxGram;
36+
private int gramSize;
37+
private int pos;
38+
private int inLen; // length of the input AFTER trim()
39+
private int charsRead; // length of the input
40+
private String inStr;
41+
private boolean started;
42+
43+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
44+
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
45+
46+
/**
47+
* Creates NGramTokenizer with given min and max n-grams.
48+
* @param input {@link Reader} holding the input to be tokenized
49+
* @param minGram the smallest n-gram to generate
50+
* @param maxGram the largest n-gram to generate
51+
*/
52+
public Lucene43NGramTokenizer(Reader input, int minGram, int maxGram) {
53+
super(input);
54+
init(minGram, maxGram);
55+
}
56+
57+
/**
58+
* Creates NGramTokenizer with given min and max n-grams.
59+
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
60+
* @param input {@link Reader} holding the input to be tokenized
61+
* @param minGram the smallest n-gram to generate
62+
* @param maxGram the largest n-gram to generate
63+
*/
64+
public Lucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
65+
super(factory, input);
66+
init(minGram, maxGram);
67+
}
68+
69+
/**
70+
* Creates NGramTokenizer with default min and max n-grams.
71+
* @param input {@link Reader} holding the input to be tokenized
72+
*/
73+
public Lucene43NGramTokenizer(Reader input) {
74+
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
75+
}
76+
77+
private void init(int minGram, int maxGram) {
78+
if (minGram < 1) {
79+
throw new IllegalArgumentException("minGram must be greater than zero");
80+
}
81+
if (minGram > maxGram) {
82+
throw new IllegalArgumentException("minGram must not be greater than maxGram");
83+
}
84+
this.minGram = minGram;
85+
this.maxGram = maxGram;
86+
}
87+
88+
/** Returns the next token in the stream, or null at EOS. */
89+
@Override
90+
public boolean incrementToken() throws IOException {
91+
clearAttributes();
92+
if (!started) {
93+
started = true;
94+
gramSize = minGram;
95+
char[] chars = new char[1024];
96+
charsRead = 0;
97+
// TODO: refactor to a shared readFully somewhere:
98+
while (charsRead < chars.length) {
99+
int inc = input.read(chars, charsRead, chars.length-charsRead);
100+
if (inc == -1) {
101+
break;
102+
}
103+
charsRead += inc;
104+
}
105+
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
106+
107+
if (charsRead == chars.length) {
108+
// Read extra throwaway chars so that on end() we
109+
// report the correct offset:
110+
char[] throwaway = new char[1024];
111+
while(true) {
112+
final int inc = input.read(throwaway, 0, throwaway.length);
113+
if (inc == -1) {
114+
break;
115+
}
116+
charsRead += inc;
117+
}
118+
}
119+
120+
inLen = inStr.length();
121+
if (inLen == 0) {
122+
return false;
123+
}
124+
}
125+
126+
if (pos+gramSize > inLen) { // if we hit the end of the string
127+
pos = 0; // reset to beginning of string
128+
gramSize++; // increase n-gram size
129+
if (gramSize > maxGram) // we are done
130+
return false;
131+
if (pos+gramSize > inLen)
132+
return false;
133+
}
134+
135+
int oldPos = pos;
136+
pos++;
137+
termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
138+
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
139+
return true;
140+
}
141+
142+
@Override
143+
public void end() {
144+
// set final offset
145+
final int finalOffset = correctOffset(charsRead);
146+
this.offsetAtt.setOffset(finalOffset, finalOffset);
147+
}
148+
149+
@Override
150+
public void reset() throws IOException {
151+
super.reset();
152+
started = false;
153+
pos = 0;
154+
}
155+
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,6 @@ public NGramFilterFactory(Map<String, String> args) {
4747

4848
@Override
4949
public NGramTokenFilter create(TokenStream input) {
50-
return new NGramTokenFilter(input, minGramSize, maxGramSize);
50+
return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
5151
}
5252
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

Lines changed: 84 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,37 +21,60 @@
2121

2222
import org.apache.lucene.analysis.TokenFilter;
2323
import org.apache.lucene.analysis.TokenStream;
24-
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
24+
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
2525
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26+
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27+
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
28+
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
29+
import org.apache.lucene.util.Version;
2630

2731
/**
2832
* Tokenizes the input into n-grams of the given size(s).
33+
* <a name="version"/>
34+
* <p>You must specify the required {@link Version} compatibility when
35+
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
36+
* <li>emits all n-grams for the same token at the same position,</li>
37+
* <li>does not modify offsets,</li>
38+
* <li>sorts n-grams by their offset in the original token first, then
39+
* increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
40+
* "c").</li></ul>
41+
* <p>You can make this filter use the old behavior by providing a version &lt;
42+
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
43+
* it will lead to broken {@link TokenStream}s that will cause highlighting
44+
* bugs.
2945
*/
3046
public final class NGramTokenFilter extends TokenFilter {
3147
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
3248
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
3349

34-
private int minGram, maxGram;
35-
50+
private final int minGram, maxGram;
51+
3652
private char[] curTermBuffer;
3753
private int curTermLength;
3854
private int curGramSize;
3955
private int curPos;
56+
private int curPosInc, curPosLen;
4057
private int tokStart;
41-
private int tokEnd; // only used if the length changed before this filter
58+
private int tokEnd;
4259
private boolean hasIllegalOffsets; // only if the length changed before this filter
43-
60+
61+
private final Version version;
4462
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
63+
private final PositionIncrementAttribute posIncAtt;
64+
private final PositionLengthAttribute posLenAtt;
4565
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
4666

4767
/**
4868
* Creates NGramTokenFilter with given min and max n-grams.
69+
* @param version Lucene version to enable correct position increments.
70+
* See <a href="#version">above</a> for details.
4971
* @param input {@link TokenStream} holding the input to be tokenized
5072
* @param minGram the smallest n-gram to generate
5173
* @param maxGram the largest n-gram to generate
5274
*/
53-
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
54-
super(input);
75+
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
76+
super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
77+
this.version = version;
5578
if (minGram < 1) {
5679
throw new IllegalArgumentException("minGram must be greater than zero");
5780
}
@@ -60,14 +83,37 @@ public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
6083
}
6184
this.minGram = minGram;
6285
this.maxGram = maxGram;
86+
if (version.onOrAfter(Version.LUCENE_44)) {
87+
posIncAtt = addAttribute(PositionIncrementAttribute.class);
88+
posLenAtt = addAttribute(PositionLengthAttribute.class);
89+
} else {
90+
posIncAtt = new PositionIncrementAttribute() {
91+
@Override
92+
public void setPositionIncrement(int positionIncrement) {}
93+
@Override
94+
public int getPositionIncrement() {
95+
return 0;
96+
}
97+
};
98+
posLenAtt = new PositionLengthAttribute() {
99+
@Override
100+
public void setPositionLength(int positionLength) {}
101+
@Override
102+
public int getPositionLength() {
103+
return 0;
104+
}
105+
};
106+
}
63107
}
64108

65109
/**
66110
* Creates NGramTokenFilter with default min and max n-grams.
111+
* @param version Lucene version to enable correct position increments.
112+
* See <a href="#version">above</a> for details.
67113
* @param input {@link TokenStream} holding the input to be tokenized
68114
*/
69-
public NGramTokenFilter(TokenStream input) {
70-
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
115+
public NGramTokenFilter(Version version, TokenStream input) {
116+
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
71117
}
72118

73119
/** Returns the next token in the stream, or null at EOS. */
@@ -82,27 +128,46 @@ public final boolean incrementToken() throws IOException {
82128
curTermLength = termAtt.length();
83129
curGramSize = minGram;
84130
curPos = 0;
131+
curPosInc = posIncAtt.getPositionIncrement();
132+
curPosLen = posLenAtt.getPositionLength();
85133
tokStart = offsetAtt.startOffset();
86134
tokEnd = offsetAtt.endOffset();
87135
// if length by start + end offsets doesn't match the term text then assume
88136
// this is a synonym and don't adjust the offsets.
89137
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
90138
}
91139
}
92-
while (curGramSize <= maxGram) {
93-
while (curPos+curGramSize <= curTermLength) { // while there is input
140+
if (version.onOrAfter(Version.LUCENE_44)) {
141+
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
142+
++curPos;
143+
curGramSize = minGram;
144+
}
145+
if (curPos + curGramSize <= curTermLength) {
94146
clearAttributes();
95147
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
96-
if (hasIllegalOffsets) {
97-
offsetAtt.setOffset(tokStart, tokEnd);
98-
} else {
99-
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
100-
}
101-
curPos++;
148+
posIncAtt.setPositionIncrement(curPosInc);
149+
curPosInc = 0;
150+
posLenAtt.setPositionLength(curPosLen);
151+
offsetAtt.setOffset(tokStart, tokEnd);
152+
curGramSize++;
102153
return true;
103154
}
104-
curGramSize++; // increase n-gram size
105-
curPos = 0;
155+
} else {
156+
while (curGramSize <= maxGram) {
157+
while (curPos+curGramSize <= curTermLength) { // while there is input
158+
clearAttributes();
159+
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
160+
if (hasIllegalOffsets) {
161+
offsetAtt.setOffset(tokStart, tokEnd);
162+
} else {
163+
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
164+
}
165+
curPos++;
166+
return true;
167+
}
168+
curGramSize++; // increase n-gram size
169+
curPos = 0;
170+
}
106171
}
107172
curTermBuffer = null;
108173
}

0 commit comments

Comments
 (0)