Skip to content

Commit 38b698d

Browse files
authored
[Analysis] Deprecate Standard Html Strip Analyzer in master (#26719)
* [Analysis] Deprecate Standard Html Strip Analyzer. Deprecate only the Standard Html Strip Analyzer. If a user creates an index with this analyzer on 7.0 or later, Elasticsearch throws an exception. If the index was created before 7.0, Elasticsearch issues a deprecation log message. We will remove the analyzer in 8.0. Related #4704
1 parent ec32e66 commit 38b698d

7 files changed

Lines changed: 90 additions & 4 deletions

File tree

docs/reference/migration/migrate_7_0/analysis.asciidoc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,11 @@ instead.
3131
==== `standard` filter has been removed
3232

3333
The `standard` token filter has been removed because it doesn't change anything in the stream.
34+
35+
[float]
36+
==== Deprecated standard_html_strip analyzer
37+
38+
The `standard_html_strip` analyzer has been deprecated, and should be replaced
39+
with a custom analyzer combining the `standard` tokenizer, the `html_strip` char_filter, and the `lowercase` token filter.
40+
Indexes created using this analyzer will still be readable in Elasticsearch 7.0,
41+
but it will not be possible to create new indexes using it.

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ public List<ScriptContext<?>> getContexts() {
171171
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
172172
Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> analyzers = new TreeMap<>();
173173
analyzers.put("fingerprint", FingerprintAnalyzerProvider::new);
174+
175+
// TODO remove in 8.0
174176
analyzers.put("standard_html_strip", StandardHtmlStripAnalyzerProvider::new);
175177
analyzers.put("pattern", PatternAnalyzerProvider::new);
176178
analyzers.put("snowball", SnowballAnalyzerProvider::new);
@@ -320,6 +322,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
320322
@Override
321323
public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
322324
List<PreBuiltAnalyzerProviderFactory> analyzers = new ArrayList<>();
325+
// TODO remove in 8.0
323326
analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.ELASTICSEARCH,
324327
() -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET)));
325328
analyzers.add(new PreBuiltAnalyzerProviderFactory("pattern", CachingStrategy.ELASTICSEARCH,

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StandardHtmlStripAnalyzer.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@ public class StandardHtmlStripAnalyzer extends StopwordAnalyzerBase {
3737
public StandardHtmlStripAnalyzer() {
3838
super(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
3939
}
40-
40+
/**
41+
* @deprecated since 6.5; cannot be used for indices created on or after 7.0, and will be removed in 8.0
42+
*/
43+
@Deprecated
4144
StandardHtmlStripAnalyzer(CharArraySet stopwords) {
4245
super(stopwords);
4346
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StandardHtmlStripAnalyzerProvider.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.CharArraySet;
24+
import org.elasticsearch.Version;
25+
import org.elasticsearch.common.logging.DeprecationLogger;
2326
import org.elasticsearch.common.settings.Settings;
2427
import org.elasticsearch.env.Environment;
2528
import org.elasticsearch.index.IndexSettings;
@@ -28,14 +31,29 @@
2831

2932
public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardHtmlStripAnalyzer> {
3033

34+
private static final DeprecationLogger DEPRECATION_LOGGER =
35+
new DeprecationLogger(LogManager.getLogger(StandardHtmlStripAnalyzerProvider.class));
36+
3137
private final StandardHtmlStripAnalyzer analyzer;
3238

39+
/**
40+
* @deprecated since 6.5; cannot be used for indices created on or after 7.0, and will be removed in 8.0
41+
*/
42+
@Deprecated
3343
StandardHtmlStripAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
3444
super(indexSettings, name, settings);
3545
final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
3646
CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
3747
analyzer = new StandardHtmlStripAnalyzer(stopWords);
3848
analyzer.setVersion(version);
49+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
50+
throw new IllegalArgumentException("[standard_html_strip] analyzer is not supported for new indices, " +
51+
"use a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter");
52+
} else {
53+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_html_strip_deprecation",
54+
"Deprecated analyzer [standard_html_strip] used, " +
55+
"replace it with a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter");
56+
}
3957
}
4058

4159
@Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.lucene.analysis.Analyzer;
2223
import org.apache.lucene.analysis.MockTokenizer;
2324
import org.apache.lucene.analysis.Tokenizer;
2425
import org.elasticsearch.Version;
2526
import org.elasticsearch.cluster.metadata.IndexMetaData;
2627
import org.elasticsearch.common.settings.Settings;
2728
import org.elasticsearch.env.Environment;
2829
import org.elasticsearch.index.IndexSettings;
30+
import org.elasticsearch.index.analysis.IndexAnalyzers;
31+
import org.elasticsearch.index.analysis.NamedAnalyzer;
2932
import org.elasticsearch.index.analysis.TokenFilterFactory;
3033
import org.elasticsearch.test.ESTestCase;
3134
import org.elasticsearch.test.IndexSettingsModule;
@@ -116,4 +119,47 @@ public void testEdgeNGramNoDeprecationWarningPre6_4() throws IOException {
116119
assertNotNull(tokenFilterFactory.create(tokenizer));
117120
}
118121
}
122+
123+
124+
/**
125+
* Check that using the deprecated analyzer name "standard_html_strip" throws an exception for indices created on or after 7.0.0
126+
*/
127+
public void testStandardHtmlStripAnalyzerDeprecationError() throws IOException {
128+
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
129+
.put(IndexMetaData.SETTING_VERSION_CREATED,
130+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
131+
.put("index.analysis.analyzer.custom_analyzer.type", "standard_html_strip")
132+
.putList("index.analysis.analyzer.custom_analyzer.stopwords", "a", "b")
133+
.build();
134+
135+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
136+
CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
137+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
138+
() -> createTestAnalysis(idxSettings, settings, commonAnalysisPlugin));
139+
assertEquals("[standard_html_strip] analyzer is not supported for new indices, " +
140+
"use a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter", ex.getMessage());
141+
}
142+
143+
/**
144+
* Check that the deprecated analyzer name "standard_html_strip" issues a deprecation warning for indices created from 6.0.0 up to (but not including) 7.0.0
145+
*/
146+
public void testStandardHtmlStripAnalyzerDeprecationWarning() throws IOException {
147+
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
148+
.put(IndexMetaData.SETTING_VERSION_CREATED,
149+
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0,
150+
VersionUtils.getPreviousVersion(Version.V_7_0_0)))
151+
.put("index.analysis.analyzer.custom_analyzer.type", "standard_html_strip")
152+
.putList("index.analysis.analyzer.custom_analyzer.stopwords", "a", "b")
153+
.build();
154+
155+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
156+
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
157+
IndexAnalyzers analyzers = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).indexAnalyzers;
158+
Analyzer analyzer = analyzers.get("custom_analyzer");
159+
assertNotNull(((NamedAnalyzer) analyzer).analyzer());
160+
assertWarnings(
161+
"Deprecated analyzer [standard_html_strip] used, " +
162+
"replace it with a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter");
163+
}
164+
}
119165
}

modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,15 @@
6969

7070
---
7171
"standard_html_strip":
72+
- skip:
73+
version: " - 6.99.99"
74+
reason: only starting from version 7.x this throws an error
7275
- do:
76+
catch: /\[standard_html_strip\] analyzer is not supported for new indices, use a custom analyzer using \[standard\] tokenizer and \[html_strip\] char_filter, plus \[lowercase\] filter/
7377
indices.analyze:
7478
body:
7579
text: <bold/> <italic/>
7680
analyzer: standard_html_strip
77-
- length: { tokens: 2 }
78-
- match: { tokens.0.token: bold }
79-
- match: { tokens.1.token: italic }
8081

8182
---
8283
"pattern":

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import org.apache.lucene.analysis.Analyzer;
2222
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
23+
import org.elasticsearch.Version;
2324
import org.elasticsearch.core.internal.io.IOUtils;
2425
import org.elasticsearch.ElasticsearchException;
2526
import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -130,7 +131,13 @@ public Analyzer getAnalyzer(String analyzer) throws IOException {
130131
throw new ElasticsearchException("failed to load analyzer for name " + key, ex);
131132
}}
132133
);
134+
} else if ("standard_html_strip".equals(analyzer)) {
135+
if (Version.CURRENT.onOrAfter(Version.V_7_0_0)) {
136+
throw new IllegalArgumentException("[standard_html_strip] analyzer is not supported for new indices, " +
137+
"use a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter");
138+
}
133139
}
140+
134141
return analyzerProvider.get(environment, analyzer).get();
135142
}
136143

0 commit comments

Comments
 (0)