Skip to content

Commit a03e38d

Browse files
committed
LUCENE-4955: Fix NGramTokenizer and NGramTokenFilter, and remove them from TestRandomChains' exclusion list.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1476135 13f79535-47bb-0310-9956-ffa450edef68
1 parent 7cfcb26 commit a03e38d

10 files changed

Lines changed: 465 additions & 135 deletions

File tree

lucene/CHANGES.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ Optimizations
3737

3838
======================= Lucene 4.4.0 =======================
3939

40+
Changes in backwards compatibility policy
41+
42+
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
43+
same position and preserves the position length and the offsets of the
44+
original token. (Simon Willnauer, Adrien Grand)
45+
46+
* LUCENE-4955: NGramTokenizer now emits n-grams in a different order
47+
(a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
48+
whitespace. (Adrien Grand)
49+
4050
Bug Fixes
4151

4252
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
@@ -46,6 +56,9 @@ Bug Fixes
4656
if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with
4757
large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir)
4858

59+
* LUCENE-4955: NGramTokenizer now supports inputs larger than 1024 chars.
60+
(Adrien Grand)
61+
4962
Optimizations
5063

5164
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
package org.apache.lucene.analysis.ngram;
2+
3+
/*
4+
* Licensed to the Apache Software Foundation (ASF) under one or more
5+
* contributor license agreements. See the NOTICE file distributed with
6+
* this work for additional information regarding copyright ownership.
7+
* The ASF licenses this file to You under the Apache License, Version 2.0
8+
* (the "License"); you may not use this file except in compliance with
9+
* the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
import java.io.IOException;
21+
import java.io.Reader;
22+
23+
import org.apache.lucene.analysis.Tokenizer;
24+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
25+
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
26+
27+
/**
28+
* Old broken version of {@link NGramTokenizer}.
29+
*/
30+
@Deprecated
31+
public final class Lucene43NGramTokenizer extends Tokenizer {
32+
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
33+
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
34+
35+
private int minGram, maxGram;
36+
private int gramSize;
37+
private int pos;
38+
private int inLen; // length of the input AFTER trim()
39+
private int charsRead; // length of the input
40+
private String inStr;
41+
private boolean started;
42+
43+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
44+
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
45+
46+
/**
47+
* Creates NGramTokenizer with given min and max n-grams.
48+
* @param input {@link Reader} holding the input to be tokenized
49+
* @param minGram the smallest n-gram to generate
50+
* @param maxGram the largest n-gram to generate
51+
*/
52+
public Lucene43NGramTokenizer(Reader input, int minGram, int maxGram) {
53+
super(input);
54+
init(minGram, maxGram);
55+
}
56+
57+
/**
58+
* Creates NGramTokenizer with given min and max n-grams.
59+
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
60+
* @param input {@link Reader} holding the input to be tokenized
61+
* @param minGram the smallest n-gram to generate
62+
* @param maxGram the largest n-gram to generate
63+
*/
64+
public Lucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
65+
super(factory, input);
66+
init(minGram, maxGram);
67+
}
68+
69+
/**
70+
* Creates NGramTokenizer with default min and max n-grams.
71+
* @param input {@link Reader} holding the input to be tokenized
72+
*/
73+
public Lucene43NGramTokenizer(Reader input) {
74+
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
75+
}
76+
77+
private void init(int minGram, int maxGram) {
78+
if (minGram < 1) {
79+
throw new IllegalArgumentException("minGram must be greater than zero");
80+
}
81+
if (minGram > maxGram) {
82+
throw new IllegalArgumentException("minGram must not be greater than maxGram");
83+
}
84+
this.minGram = minGram;
85+
this.maxGram = maxGram;
86+
}
87+
88+
/** Returns the next token in the stream, or null at EOS. */
89+
@Override
90+
public boolean incrementToken() throws IOException {
91+
clearAttributes();
92+
if (!started) {
93+
started = true;
94+
gramSize = minGram;
95+
char[] chars = new char[1024];
96+
charsRead = 0;
97+
// TODO: refactor to a shared readFully somewhere:
98+
while (charsRead < chars.length) {
99+
int inc = input.read(chars, charsRead, chars.length-charsRead);
100+
if (inc == -1) {
101+
break;
102+
}
103+
charsRead += inc;
104+
}
105+
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
106+
107+
if (charsRead == chars.length) {
108+
// Read extra throwaway chars so that on end() we
109+
// report the correct offset:
110+
char[] throwaway = new char[1024];
111+
while(true) {
112+
final int inc = input.read(throwaway, 0, throwaway.length);
113+
if (inc == -1) {
114+
break;
115+
}
116+
charsRead += inc;
117+
}
118+
}
119+
120+
inLen = inStr.length();
121+
if (inLen == 0) {
122+
return false;
123+
}
124+
}
125+
126+
if (pos+gramSize > inLen) { // if we hit the end of the string
127+
pos = 0; // reset to beginning of string
128+
gramSize++; // increase n-gram size
129+
if (gramSize > maxGram) // we are done
130+
return false;
131+
if (pos+gramSize > inLen)
132+
return false;
133+
}
134+
135+
int oldPos = pos;
136+
pos++;
137+
termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
138+
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
139+
return true;
140+
}
141+
142+
@Override
143+
public void end() {
144+
// set final offset
145+
final int finalOffset = correctOffset(charsRead);
146+
this.offsetAtt.setOffset(finalOffset, finalOffset);
147+
}
148+
149+
@Override
150+
public void reset() throws IOException {
151+
super.reset();
152+
started = false;
153+
pos = 0;
154+
}
155+
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,6 @@ public NGramFilterFactory(Map<String, String> args) {
4747

4848
@Override
4949
public NGramTokenFilter create(TokenStream input) {
50-
return new NGramTokenFilter(input, minGramSize, maxGramSize);
50+
return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
5151
}
5252
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

Lines changed: 84 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,37 +21,60 @@
2121

2222
import org.apache.lucene.analysis.TokenFilter;
2323
import org.apache.lucene.analysis.TokenStream;
24-
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
24+
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
2525
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26+
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27+
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
28+
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
29+
import org.apache.lucene.util.Version;
2630

2731
/**
2832
* Tokenizes the input into n-grams of the given size(s).
33+
* <a name="version"/>
34+
* <p>You must specify the required {@link Version} compatibility when
35+
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
36+
* <li>emits all n-grams for the same token at the same position,</li>
37+
* <li>does not modify offsets,</li>
38+
* <li>sorts n-grams by their offset in the original token first, then
39+
* increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
40+
* "c").</li></ul>
41+
* <p>You can make this filter use the old behavior by providing a version &lt;
42+
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
43+
* it will lead to broken {@link TokenStream}s that will cause highlighting
44+
* bugs.
2945
*/
3046
public final class NGramTokenFilter extends TokenFilter {
3147
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
3248
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
3349

34-
private int minGram, maxGram;
35-
50+
private final int minGram, maxGram;
51+
3652
private char[] curTermBuffer;
3753
private int curTermLength;
3854
private int curGramSize;
3955
private int curPos;
56+
private int curPosInc, curPosLen;
4057
private int tokStart;
41-
private int tokEnd; // only used if the length changed before this filter
58+
private int tokEnd;
4259
private boolean hasIllegalOffsets; // only if the length changed before this filter
43-
60+
61+
private final Version version;
4462
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
63+
private final PositionIncrementAttribute posIncAtt;
64+
private final PositionLengthAttribute posLenAtt;
4565
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
4666

4767
/**
4868
* Creates NGramTokenFilter with given min and max n-grams.
69+
* @param version Lucene version to enable correct position increments.
70+
* See <a href="#version">above</a> for details.
4971
* @param input {@link TokenStream} holding the input to be tokenized
5072
* @param minGram the smallest n-gram to generate
5173
* @param maxGram the largest n-gram to generate
5274
*/
53-
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
54-
super(input);
75+
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
76+
super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
77+
this.version = version;
5578
if (minGram < 1) {
5679
throw new IllegalArgumentException("minGram must be greater than zero");
5780
}
@@ -60,14 +83,37 @@ public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
6083
}
6184
this.minGram = minGram;
6285
this.maxGram = maxGram;
86+
if (version.onOrAfter(Version.LUCENE_44)) {
87+
posIncAtt = addAttribute(PositionIncrementAttribute.class);
88+
posLenAtt = addAttribute(PositionLengthAttribute.class);
89+
} else {
90+
posIncAtt = new PositionIncrementAttribute() {
91+
@Override
92+
public void setPositionIncrement(int positionIncrement) {}
93+
@Override
94+
public int getPositionIncrement() {
95+
return 0;
96+
}
97+
};
98+
posLenAtt = new PositionLengthAttribute() {
99+
@Override
100+
public void setPositionLength(int positionLength) {}
101+
@Override
102+
public int getPositionLength() {
103+
return 0;
104+
}
105+
};
106+
}
63107
}
64108

65109
/**
66110
* Creates NGramTokenFilter with default min and max n-grams.
111+
* @param version Lucene version to enable correct position increments.
112+
* See <a href="#version">above</a> for details.
67113
* @param input {@link TokenStream} holding the input to be tokenized
68114
*/
69-
public NGramTokenFilter(TokenStream input) {
70-
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
115+
public NGramTokenFilter(Version version, TokenStream input) {
116+
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
71117
}
72118

73119
/** Returns the next token in the stream, or null at EOS. */
@@ -82,27 +128,46 @@ public final boolean incrementToken() throws IOException {
82128
curTermLength = termAtt.length();
83129
curGramSize = minGram;
84130
curPos = 0;
131+
curPosInc = posIncAtt.getPositionIncrement();
132+
curPosLen = posLenAtt.getPositionLength();
85133
tokStart = offsetAtt.startOffset();
86134
tokEnd = offsetAtt.endOffset();
87135
// if length by start + end offsets doesn't match the term text then assume
88136
// this is a synonym and don't adjust the offsets.
89137
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
90138
}
91139
}
92-
while (curGramSize <= maxGram) {
93-
while (curPos+curGramSize <= curTermLength) { // while there is input
140+
if (version.onOrAfter(Version.LUCENE_44)) {
141+
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
142+
++curPos;
143+
curGramSize = minGram;
144+
}
145+
if (curPos + curGramSize <= curTermLength) {
94146
clearAttributes();
95147
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
96-
if (hasIllegalOffsets) {
97-
offsetAtt.setOffset(tokStart, tokEnd);
98-
} else {
99-
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
100-
}
101-
curPos++;
148+
posIncAtt.setPositionIncrement(curPosInc);
149+
curPosInc = 0;
150+
posLenAtt.setPositionLength(curPosLen);
151+
offsetAtt.setOffset(tokStart, tokEnd);
152+
curGramSize++;
102153
return true;
103154
}
104-
curGramSize++; // increase n-gram size
105-
curPos = 0;
155+
} else {
156+
while (curGramSize <= maxGram) {
157+
while (curPos+curGramSize <= curTermLength) { // while there is input
158+
clearAttributes();
159+
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
160+
if (hasIllegalOffsets) {
161+
offsetAtt.setOffset(tokStart, tokEnd);
162+
} else {
163+
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
164+
}
165+
curPos++;
166+
return true;
167+
}
168+
curGramSize++; // increase n-gram size
169+
curPos = 0;
170+
}
106171
}
107172
curTermBuffer = null;
108173
}

0 commit comments

Comments
 (0)