Skip to content

Commit 2df6ea2

Browse files
gpaimla authored and mocobeta committed
LUCENE-8891: Add snowball stemmer and analyzer for Estonian language.
Signed-off-by: Tomoko Uchida <tomoko@apache.org>
1 parent a9e37a6 commit 2df6ea2

6 files changed

Lines changed: 3720 additions & 0 deletions

File tree

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ API Changes
2121

2222
New Features
2323

24+
* LUCENE-8891: Snowball stemmer/analyzer for the Estonian language.
25+
(Gert Morten Paimla via Tomoko Uchida)
26+
2427
* LUCENE-8815: Provide a DoubleValues implementation for retrieving the value of features without
2528
requiring a separate numeric field. Note that as feature values are stored with only 8 bits of
2629
mantissa the values returned may have a delta from the original values indexed.
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.analysis.et;
18+
19+
20+
import java.io.IOException;
21+
import java.io.Reader;
22+
23+
import org.apache.lucene.analysis.Analyzer;
24+
import org.apache.lucene.analysis.CharArraySet;
25+
import org.apache.lucene.analysis.LowerCaseFilter;
26+
import org.apache.lucene.analysis.StopFilter;
27+
import org.apache.lucene.analysis.StopwordAnalyzerBase;
28+
import org.apache.lucene.analysis.TokenStream;
29+
import org.apache.lucene.analysis.Tokenizer;
30+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
31+
import org.apache.lucene.analysis.snowball.SnowballFilter;
32+
import org.apache.lucene.analysis.standard.StandardTokenizer;
33+
import org.tartarus.snowball.ext.EstonianStemmer;
34+
35+
/**
36+
* {@link Analyzer} for Estonian.
37+
*/
38+
public final class EstonianAnalyzer extends StopwordAnalyzerBase {
39+
private final CharArraySet stemExclusionSet;
40+
41+
/** File containing default Estonian stopwords. */
42+
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
43+
44+
/**
45+
* Returns an unmodifiable instance of the default stop words set.
46+
* @return default stop words set.
47+
*/
48+
public static CharArraySet getDefaultStopSet(){
49+
return DefaultSetHolder.DEFAULT_STOP_SET;
50+
}
51+
52+
/**
53+
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
54+
* accesses the static final set the first time.;
55+
*/
56+
private static class DefaultSetHolder {
57+
static final CharArraySet DEFAULT_STOP_SET;
58+
59+
static {
60+
try {
61+
DEFAULT_STOP_SET = loadStopwordSet(false,
62+
EstonianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
63+
} catch (IOException ex) {
64+
// default set should always be present as it is part of the
65+
// distribution (JAR)
66+
throw new RuntimeException("Unable to load default stopword set");
67+
}
68+
}
69+
}
70+
71+
/**
72+
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
73+
*/
74+
public EstonianAnalyzer() {
75+
this(DefaultSetHolder.DEFAULT_STOP_SET);
76+
}
77+
78+
/**
79+
* Builds an analyzer with the given stop words.
80+
*
81+
* @param stopwords a stopword set
82+
*/
83+
public EstonianAnalyzer(CharArraySet stopwords) {
84+
this(stopwords, CharArraySet.EMPTY_SET);
85+
}
86+
87+
/**
88+
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
89+
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
90+
* stemming.
91+
*
92+
* @param stopwords a stopword set
93+
* @param stemExclusionSet a set of terms not to be stemmed
94+
*/
95+
public EstonianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
96+
super(stopwords);
97+
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
98+
}
99+
100+
/**
101+
* Creates a
102+
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
103+
* which tokenizes all the text in the provided {@link Reader}.
104+
*
105+
* @return A
106+
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
107+
* built from an {@link StandardTokenizer} filtered with
108+
* {@link LowerCaseFilter}, {@link StopFilter}
109+
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
110+
* provided and {@link SnowballFilter}.
111+
*/
112+
@Override
113+
protected TokenStreamComponents createComponents(String fieldName) {
114+
final Tokenizer source = new StandardTokenizer();
115+
TokenStream result = new LowerCaseFilter(source);
116+
result = new StopFilter(result, stopwords);
117+
if(!stemExclusionSet.isEmpty())
118+
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
119+
result = new SnowballFilter(result, new EstonianStemmer());
120+
return new TokenStreamComponents(source, result);
121+
}
122+
123+
@Override
124+
protected TokenStream normalize(String fieldName, TokenStream in) {
125+
return new LowerCaseFilter(in);
126+
}
127+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
/**
19+
* Analyzer for Estonian.
20+
*/
21+
package org.apache.lucene.analysis.et;

0 commit comments

Comments
 (0)