2121
2222import org .apache .lucene .analysis .TokenFilter ;
2323import org .apache .lucene .analysis .TokenStream ;
24- import org .apache .lucene .analysis .tokenattributes . OffsetAttribute ;
24+ import org .apache .lucene .analysis .miscellaneous . LengthFilter ;
2525import org .apache .lucene .analysis .tokenattributes .CharTermAttribute ;
26+ import org .apache .lucene .analysis .tokenattributes .OffsetAttribute ;
27+ import org .apache .lucene .analysis .tokenattributes .PositionIncrementAttribute ;
28+ import org .apache .lucene .analysis .tokenattributes .PositionLengthAttribute ;
29+ import org .apache .lucene .util .Version ;
2630
2731/**
2832 * Tokenizes the input into n-grams of the given size(s).
33+ * <a name="version"></a>
34+ * <p>You must specify the required {@link Version} compatibility when
35+ * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
36+ * <li>emits all n-grams for the same token at the same position,</li>
37+ * <li>does not modify offsets,</li>
38+ * <li>sorts n-grams by their offset in the original token first, then
39+ * increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
40+ * "c").</li></ul>
41+ * <p>You can make this filter use the old behavior by providing a version &lt;
42+ * {@link Version#LUCENE_44} in the constructor but this is not recommended as
43+ * it will lead to broken {@link TokenStream}s that will cause highlighting
44+ * bugs.
2945 */
3046public final class NGramTokenFilter extends TokenFilter {
3147 public static final int DEFAULT_MIN_NGRAM_SIZE = 1 ;
3248 public static final int DEFAULT_MAX_NGRAM_SIZE = 2 ;
3349
34- private int minGram , maxGram ;
35-
50+ private final int minGram , maxGram ;
51+
3652 private char [] curTermBuffer ;
3753 private int curTermLength ;
3854 private int curGramSize ;
3955 private int curPos ;
56+ private int curPosInc , curPosLen ;
4057 private int tokStart ;
41- private int tokEnd ; // only used if the length changed before this filter
58+ private int tokEnd ;
4259 private boolean hasIllegalOffsets ; // only if the length changed before this filter
43-
60+
61+ private final Version version ;
4462 private final CharTermAttribute termAtt = addAttribute (CharTermAttribute .class );
63+ private final PositionIncrementAttribute posIncAtt ;
64+ private final PositionLengthAttribute posLenAtt ;
4565 private final OffsetAttribute offsetAtt = addAttribute (OffsetAttribute .class );
4666
4767 /**
4868 * Creates NGramTokenFilter with given min and max n-grams.
69+ * @param version Lucene version to enable correct position increments.
70+ * See <a href="#version">above</a> for details.
4971 * @param input {@link TokenStream} holding the input to be tokenized
5072 * @param minGram the smallest n-gram to generate
5173 * @param maxGram the largest n-gram to generate
5274 */
53- public NGramTokenFilter (TokenStream input , int minGram , int maxGram ) {
54- super (input );
75+ public NGramTokenFilter (Version version , TokenStream input , int minGram , int maxGram ) {
76+ super (new LengthFilter (true , input , minGram , Integer .MAX_VALUE ));
77+ this .version = version ;
5578 if (minGram < 1 ) {
5679 throw new IllegalArgumentException ("minGram must be greater than zero" );
5780 }
@@ -60,14 +83,37 @@ public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
6083 }
6184 this .minGram = minGram ;
6285 this .maxGram = maxGram ;
86+ if (version .onOrAfter (Version .LUCENE_44 )) {
87+ posIncAtt = addAttribute (PositionIncrementAttribute .class );
88+ posLenAtt = addAttribute (PositionLengthAttribute .class );
89+ } else {
90+ posIncAtt = new PositionIncrementAttribute () {
91+ @ Override
92+ public void setPositionIncrement (int positionIncrement ) {}
93+ @ Override
94+ public int getPositionIncrement () {
95+ return 0 ;
96+ }
97+ };
98+ posLenAtt = new PositionLengthAttribute () {
99+ @ Override
100+ public void setPositionLength (int positionLength ) {}
101+ @ Override
102+ public int getPositionLength () {
103+ return 0 ;
104+ }
105+ };
106+ }
63107 }
64108
65109 /**
66110 * Creates NGramTokenFilter with default min and max n-grams.
111+ * @param version Lucene version to enable correct position increments.
112+ * See <a href="#version">above</a> for details.
67113 * @param input {@link TokenStream} holding the input to be tokenized
68114 */
69- public NGramTokenFilter (TokenStream input ) {
70- this (input , DEFAULT_MIN_NGRAM_SIZE , DEFAULT_MAX_NGRAM_SIZE );
115+ public NGramTokenFilter (Version version , TokenStream input ) {
116+ this (version , input , DEFAULT_MIN_NGRAM_SIZE , DEFAULT_MAX_NGRAM_SIZE );
71117 }
72118
73119 /** Advances the stream to the next n-gram; returns false at EOS. */
@@ -82,27 +128,46 @@ public final boolean incrementToken() throws IOException {
82128 curTermLength = termAtt .length ();
83129 curGramSize = minGram ;
84130 curPos = 0 ;
131+ curPosInc = posIncAtt .getPositionIncrement ();
132+ curPosLen = posLenAtt .getPositionLength ();
85133 tokStart = offsetAtt .startOffset ();
86134 tokEnd = offsetAtt .endOffset ();
87135 // if length by start + end offsets doesn't match the term text then assume
88136 // this is a synonym and don't adjust the offsets.
89137 hasIllegalOffsets = (tokStart + curTermLength ) != tokEnd ;
90138 }
91139 }
92- while (curGramSize <= maxGram ) {
93- while (curPos +curGramSize <= curTermLength ) { // while there is input
140+ if (version .onOrAfter (Version .LUCENE_44 )) {
141+ if (curGramSize > maxGram || curPos + curGramSize > curTermLength ) {
142+ ++curPos ;
143+ curGramSize = minGram ;
144+ }
145+ if (curPos + curGramSize <= curTermLength ) {
94146 clearAttributes ();
95147 termAtt .copyBuffer (curTermBuffer , curPos , curGramSize );
96- if (hasIllegalOffsets ) {
97- offsetAtt .setOffset (tokStart , tokEnd );
98- } else {
99- offsetAtt .setOffset (tokStart + curPos , tokStart + curPos + curGramSize );
100- }
101- curPos ++;
148+ posIncAtt .setPositionIncrement (curPosInc );
149+ curPosInc = 0 ;
150+ posLenAtt .setPositionLength (curPosLen );
151+ offsetAtt .setOffset (tokStart , tokEnd );
152+ curGramSize ++;
102153 return true ;
103154 }
104- curGramSize ++; // increase n-gram size
105- curPos = 0 ;
155+ } else {
156+ while (curGramSize <= maxGram ) {
157+ while (curPos +curGramSize <= curTermLength ) { // while there is input
158+ clearAttributes ();
159+ termAtt .copyBuffer (curTermBuffer , curPos , curGramSize );
160+ if (hasIllegalOffsets ) {
161+ offsetAtt .setOffset (tokStart , tokEnd );
162+ } else {
163+ offsetAtt .setOffset (tokStart + curPos , tokStart + curPos + curGramSize );
164+ }
165+ curPos ++;
166+ return true ;
167+ }
168+ curGramSize ++; // increase n-gram size
169+ curPos = 0 ;
170+ }
106171 }
107172 curTermBuffer = null ;
108173 }
0 commit comments