Skip to content

Commit fe9145f

Browse files
authored
Search - add case insensitive flag for "term" family of queries (#61596)
Adds case insensitive flag for term, prefix, and wildcard queries Closes #61546
1 parent dd11f5f commit fe9145f

File tree

46 files changed

+914
-134
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+914
-134
lines changed

docs/reference/query-dsl/prefix-query.asciidoc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ provided `<field>`.
4141
(Optional, string) Method used to rewrite the query. For valid values and more
4242
information, see the <<query-dsl-multi-term-rewrite, `rewrite` parameter>>.
4343

44+
`case_insensitive`::
45+
(Optional, boolean) Allows ASCII case-insensitive matching of the
46+
value with the indexed field values when set to true. Setting to false is disallowed.
47+
4448
[[prefix-query-notes]]
4549
==== Notes
4650

docs/reference/query-dsl/term-query.asciidoc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ Boost values are relative to the default value of `1.0`. A boost value between
6262
`0` and `1.0` decreases the relevance score. A value greater than `1.0`
6363
increases the relevance score.
6464

65+
`case_insensitive`::
66+
(Optional, boolean) Allows ASCII case-insensitive matching of the
67+
value with the indexed field values when set to true. Setting to false is disallowed.
68+
6569
[[term-query-notes]]
6670
==== Notes
6771

@@ -84,7 +88,7 @@ The `term` query does *not* analyze the search term. The `term` query only
8488
searches for the *exact* term you provide. This means the `term` query may
8589
return poor or no results when searching `text` fields.
8690

87-
To see the difference in search results, try the following example.
91+
To see the difference in search results, try the following example.
8892

8993
. Create an index with a `text` field called `full_text`.
9094
+
@@ -213,4 +217,4 @@ in the results.
213217
}
214218
----
215219
// TESTRESPONSE[s/"took" : 1/"took" : $body.took/]
216-
--
220+
--

docs/reference/query-dsl/wildcard-query.asciidoc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ This parameter supports two wildcard operators:
5252

5353
WARNING: Avoid beginning patterns with `*` or `?`. This can increase
5454
the iterations needed to find matching terms and slow search performance.
55-
--
55+
--
5656

5757
`boost`::
5858
(Optional, float) Floating point number used to decrease or increase the
@@ -69,6 +69,10 @@ increases the relevance score.
6969
(Optional, string) Method used to rewrite the query. For valid values and more information, see the
7070
<<query-dsl-multi-term-rewrite, `rewrite` parameter>>.
7171

72+
`case_insensitive`::
73+
(Optional, boolean) Allows ASCII case-insensitive matching of the
74+
pattern with the indexed field values when set to true. Setting to false is disallowed.
75+
7276
[[wildcard-query-notes]]
7377
==== Notes
7478
===== Allow expensive queries

modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/SearchAsYouTypeFieldMapper.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,11 @@ public Query existsQuery(QueryShardContext context) {
281281
}
282282

283283
@Override
284-
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
284+
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
285285
if (prefixField == null || prefixField.termLengthWithinBounds(value.length()) == false) {
286-
return super.prefixQuery(value, method, context);
286+
return super.prefixQuery(value, method, caseInsensitive, context);
287287
} else {
288-
final Query query = prefixField.prefixQuery(value, method, context);
288+
final Query query = prefixField.prefixQuery(value, method, caseInsensitive, context);
289289
if (method == null
290290
|| method == MultiTermQuery.CONSTANT_SCORE_REWRITE
291291
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
@@ -365,8 +365,11 @@ boolean termLengthWithinBounds(int length) {
365365
}
366366

367367
@Override
368-
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
368+
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
369369
if (value.length() >= minChars) {
370+
if(caseInsensitive) {
371+
return super.termQueryCaseInsensitive(value, context);
372+
}
370373
return super.termQuery(value, context);
371374
}
372375
List<Automaton> automata = new ArrayList<>();
@@ -507,11 +510,11 @@ public Query existsQuery(QueryShardContext context) {
507510
}
508511

509512
@Override
510-
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
513+
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
511514
if (prefixFieldType == null || prefixFieldType.termLengthWithinBounds(value.length()) == false) {
512-
return super.prefixQuery(value, method, context);
515+
return super.prefixQuery(value, method, caseInsensitive, context);
513516
} else {
514-
final Query query = prefixFieldType.prefixQuery(value, method, context);
517+
final Query query = prefixFieldType.prefixQuery(value, method, caseInsensitive, context);
515518
if (method == null
516519
|| method == MultiTermQuery.CONSTANT_SCORE_REWRITE
517520
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {

plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,15 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
136136
}
137137

138138
@Override
139-
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
139+
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method,
140+
boolean caseInsensitive, QueryShardContext context) {
140141
throw new UnsupportedOperationException("[prefix] queries are not supported on [" + CONTENT_TYPE + "] fields.");
141142
}
142143

143144
@Override
144145
public Query wildcardQuery(String value,
145146
@Nullable MultiTermQuery.RewriteMethod method,
147+
boolean caseInsensitive,
146148
QueryShardContext context) {
147149
throw new UnsupportedOperationException("[wildcard] queries are not supported on [" + CONTENT_TYPE + "] fields.");
148150
}

server/src/main/java/org/elasticsearch/common/Strings.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,4 +879,18 @@ public static String padStart(String s, int minimumLength, char c) {
879879
return sb.toString();
880880
}
881881
}
882+
883+
public static String toLowercaseAscii(String in) {
884+
StringBuilder out = new StringBuilder();
885+
Iterator<Integer> iter = in.codePoints().iterator();
886+
while (iter.hasNext()) {
887+
int codepoint = iter.next();
888+
if (codepoint > 128) {
889+
out.appendCodePoint(codepoint);
890+
} else {
891+
out.appendCodePoint(Character.toLowerCase(codepoint));
892+
}
893+
}
894+
return out.toString();
895+
}
882896
}
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.common.lucene.search;
21+
22+
import org.apache.lucene.index.Term;
23+
import org.apache.lucene.search.AutomatonQuery;
24+
import org.apache.lucene.util.BytesRef;
25+
import org.apache.lucene.util.automaton.Automata;
26+
import org.apache.lucene.util.automaton.Automaton;
27+
import org.apache.lucene.util.automaton.MinimizationOperations;
28+
import org.apache.lucene.util.automaton.Operations;
29+
30+
import java.util.ArrayList;
31+
import java.util.Iterator;
32+
import java.util.List;
33+
34+
/**
35+
* Helper functions for creating various forms of {@link AutomatonQuery}
36+
*/
37+
public class AutomatonQueries {
38+
39+
40+
41+
/** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
42+
public static Automaton caseInsensitivePrefix(String s) {
43+
List<Automaton> list = new ArrayList<>();
44+
Iterator<Integer> iter = s.codePoints().iterator();
45+
while (iter.hasNext()) {
46+
list.add(toCaseInsensitiveChar(iter.next(), Integer.MAX_VALUE));
47+
}
48+
list.add(Automata.makeAnyString());
49+
50+
Automaton a = Operations.concatenate(list);
51+
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
52+
return a;
53+
}
54+
55+
56+
/** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
57+
public static AutomatonQuery caseInsensitivePrefixQuery(Term prefix) {
58+
return new AutomatonQuery(prefix, caseInsensitivePrefix(prefix.text()));
59+
}
60+
61+
/** Build an automaton accepting all terms ASCII case insensitive. */
62+
public static AutomatonQuery caseInsensitiveTermQuery(Term term) {
63+
BytesRef prefix = term.bytes();
64+
return new AutomatonQuery(term, toCaseInsensitiveString(prefix,Integer.MAX_VALUE));
65+
}
66+
67+
68+
/** Build an automaton matching a wildcard pattern, ASCII case insensitive. */
69+
public static AutomatonQuery caseInsensitiveWildcardQuery(Term wildcardquery) {
70+
return new AutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery,Integer.MAX_VALUE));
71+
}
72+
73+
74+
/** String equality with support for wildcards */
75+
public static final char WILDCARD_STRING = '*';
76+
77+
/** Char equality with support for wildcards */
78+
public static final char WILDCARD_CHAR = '?';
79+
80+
/** Escape character */
81+
public static final char WILDCARD_ESCAPE = '\\';
82+
/**
83+
* Convert Lucene wildcard syntax into an automaton.
84+
*/
85+
@SuppressWarnings("fallthrough")
86+
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, int maxDeterminizedStates) {
87+
List<Automaton> automata = new ArrayList<>();
88+
89+
String wildcardText = wildcardquery.text();
90+
91+
for (int i = 0; i < wildcardText.length();) {
92+
final int c = wildcardText.codePointAt(i);
93+
int length = Character.charCount(c);
94+
switch(c) {
95+
case WILDCARD_STRING:
96+
automata.add(Automata.makeAnyString());
97+
break;
98+
case WILDCARD_CHAR:
99+
automata.add(Automata.makeAnyChar());
100+
break;
101+
case WILDCARD_ESCAPE:
102+
// add the next codepoint instead, if it exists
103+
if (i + length < wildcardText.length()) {
104+
final int nextChar = wildcardText.codePointAt(i + length);
105+
length += Character.charCount(nextChar);
106+
automata.add(Automata.makeChar(nextChar));
107+
break;
108+
} // else fallthru, lenient parsing with a trailing \
109+
default:
110+
automata.add(toCaseInsensitiveChar(c, maxDeterminizedStates));
111+
}
112+
i += length;
113+
}
114+
115+
return Operations.concatenate(automata);
116+
}
117+
118+
protected static Automaton toCaseInsensitiveString(BytesRef br, int maxDeterminizedStates) {
119+
return toCaseInsensitiveString(br.utf8ToString(), maxDeterminizedStates);
120+
}
121+
122+
public static Automaton toCaseInsensitiveString(String s, int maxDeterminizedStates) {
123+
List<Automaton> list = new ArrayList<>();
124+
Iterator<Integer> iter = s.codePoints().iterator();
125+
while (iter.hasNext()) {
126+
list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
127+
}
128+
129+
Automaton a = Operations.concatenate(list);
130+
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
131+
return a;
132+
133+
134+
}
135+
136+
protected static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
137+
Automaton case1 = Automata.makeChar(codepoint);
138+
// For now we only work with ASCII characters
139+
if (codepoint > 128) {
140+
return case1;
141+
}
142+
int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
143+
Automaton result;
144+
if (altCase != codepoint) {
145+
result = Operations.union(case1, Automata.makeChar(altCase));
146+
result = MinimizationOperations.minimize(result, maxDeterminizedStates);
147+
} else {
148+
result = case1;
149+
}
150+
return result;
151+
}
152+
}

server/src/main/java/org/elasticsearch/common/regex/Regex.java

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,39 @@ public static Automaton simpleMatchToAutomaton(String... patterns) {
7979
* Match a String against the given pattern, supporting the following simple
8080
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
8181
* arbitrary number of pattern parts), as well as direct equality.
82+
* Matching is case sensitive.
8283
*
8384
* @param pattern the pattern to match against
8485
* @param str the String to match
8586
* @return whether the String matches the given pattern
8687
*/
8788
public static boolean simpleMatch(String pattern, String str) {
89+
return simpleMatch(pattern, str, false);
90+
}
91+
92+
93+
/**
94+
* Match a String against the given pattern, supporting the following simple
95+
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
96+
* arbitrary number of pattern parts), as well as direct equality.
97+
*
98+
* @param pattern the pattern to match against
99+
* @param str the String to match
100+
* @param caseInsensitive true if ASCII case differences should be ignored
101+
* @return whether the String matches the given pattern
102+
*/
103+
public static boolean simpleMatch(String pattern, String str, boolean caseInsensitive) {
88104
if (pattern == null || str == null) {
89105
return false;
90106
}
107+
if (caseInsensitive) {
108+
pattern = Strings.toLowercaseAscii(pattern);
109+
str = Strings.toLowercaseAscii(str);
110+
}
111+
return simpleMatchWithNormalizedStrings(pattern, str);
112+
}
113+
114+
private static boolean simpleMatchWithNormalizedStrings(String pattern, String str) {
91115
final int firstIndex = pattern.indexOf('*');
92116
if (firstIndex == -1) {
93117
return pattern.equals(str);
@@ -102,12 +126,12 @@ public static boolean simpleMatch(String pattern, String str) {
102126
return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1);
103127
} else if (nextIndex == 1) {
104128
// Double wildcard "**" - skipping the first "*"
105-
return simpleMatch(pattern.substring(1), str);
129+
return simpleMatchWithNormalizedStrings(pattern.substring(1), str);
106130
}
107131
final String part = pattern.substring(1, nextIndex);
108132
int partIndex = str.indexOf(part);
109133
while (partIndex != -1) {
110-
if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
134+
if (simpleMatchWithNormalizedStrings(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
111135
return true;
112136
}
113137
partIndex = str.indexOf(part, partIndex + 1);
@@ -116,9 +140,9 @@ public static boolean simpleMatch(String pattern, String str) {
116140
}
117141
return str.regionMatches(0, pattern, 0, firstIndex)
118142
&& (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string
119-
|| simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
120-
}
121-
143+
|| simpleMatchWithNormalizedStrings(pattern.substring(firstIndex), str.substring(firstIndex)));
144+
}
145+
122146
/**
123147
* Match a String against the given patterns, supporting the following simple
124148
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an

0 commit comments

Comments
 (0)