-
Notifications
You must be signed in to change notification settings - Fork 25.9k
Add a simple JSON field mapper. #33923
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
768a5d7
09d0301
6d74f54
6d58cca
236d568
26884b6
50e6439
bef3f0e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,327 @@ | ||
| /* | ||
| * Licensed to Elasticsearch under one or more contributor | ||
| * license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright | ||
| * ownership. Elasticsearch licenses this file to you under | ||
| * the Apache License, Version 2.0 (the "License"); you may | ||
| * not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.mapper; | ||
|
|
||
| import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | ||
| import org.apache.lucene.index.IndexOptions; | ||
| import org.apache.lucene.index.IndexableField; | ||
| import org.apache.lucene.index.Term; | ||
| import org.apache.lucene.search.MultiTermQuery; | ||
| import org.apache.lucene.search.Query; | ||
| import org.apache.lucene.search.TermQuery; | ||
| import org.apache.lucene.util.BytesRef; | ||
| import org.elasticsearch.common.lucene.Lucene; | ||
| import org.elasticsearch.common.settings.Settings; | ||
| import org.elasticsearch.common.unit.Fuzziness; | ||
| import org.elasticsearch.common.xcontent.XContentBuilder; | ||
| import org.elasticsearch.common.xcontent.XContentParser; | ||
| import org.elasticsearch.common.xcontent.support.XContentMapValues; | ||
| import org.elasticsearch.index.analysis.AnalyzerScope; | ||
| import org.elasticsearch.index.analysis.NamedAnalyzer; | ||
| import org.elasticsearch.index.query.QueryShardContext; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Iterator; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
|
|
||
| import static org.elasticsearch.index.mapper.TypeParsers.parseField; | ||
|
|
||
| /** | ||
| * A field mapper that accepts a JSON object and flattens it into a single field. This data type | ||
| * can be a useful alternative to an 'object' mapping when the object has a large, unknown set | ||
| * of keys. | ||
| * | ||
| * Currently the mapper extracts all leaf values of the JSON object, converts them to their text | ||
| * representations, and indexes each one as a keyword. As an example, given a json field called | ||
| * 'json_field' and the following input | ||
| * | ||
| * { | ||
| * "json_field": { | ||
| * "key1": "some value", | ||
| * "key2": { | ||
| * "key3": true | ||
| * } | ||
| * } | ||
| * } | ||
| * | ||
| * the mapper will produce untokenized string fields with the values "some value" and "true". | ||
| */ | ||
| public final class JsonFieldMapper extends FieldMapper { | ||
|
|
||
    // Mapping type name under which this mapper is registered ("type": "json").
    public static final String CONTENT_TYPE = "json";

    // Analyzer applied to query strings when 'split_queries_on_whitespace' is enabled;
    // indexing always uses the keyword analyzer (see JsonFieldType's constructor).
    public static final NamedAnalyzer WHITESPACE_ANALYZER = new NamedAnalyzer(
        "whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer());

    /**
     * Default mapping options: untokenized, no norms, not stored, and indexed
     * with docs only (no frequencies or positions). The field type is frozen
     * so it can be safely shared as a default.
     */
    private static class Defaults {
        public static final MappedFieldType FIELD_TYPE = new JsonFieldType();

        static {
            FIELD_TYPE.setTokenized(false);
            FIELD_TYPE.setOmitNorms(true);
            FIELD_TYPE.setStored(false);
            FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
            FIELD_TYPE.freeze();
        }

        // By default no leaf value is skipped for being too long.
        public static final int IGNORE_ABOVE = Integer.MAX_VALUE;
    }
|
|
||
| public static class Builder extends FieldMapper.Builder<Builder, JsonFieldMapper> { | ||
| private int ignoreAbove = Defaults.IGNORE_ABOVE; | ||
|
|
||
| public Builder(String name) { | ||
| super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); | ||
| builder = this; | ||
| } | ||
|
|
||
| @Override | ||
| public JsonFieldType fieldType() { | ||
| return (JsonFieldType) super.fieldType(); | ||
| } | ||
|
|
||
| @Override | ||
| public Builder indexOptions(IndexOptions indexOptions) { | ||
| if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) > 0) { | ||
| throw new IllegalArgumentException("The [" + CONTENT_TYPE | ||
| + "] field does not support positions, got [index_options]=" | ||
| + indexOptionToString(indexOptions)); | ||
| } | ||
| return super.indexOptions(indexOptions); | ||
| } | ||
|
|
||
| public Builder ignoreAbove(int ignoreAbove) { | ||
| if (ignoreAbove < 0) { | ||
| throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove); | ||
| } | ||
| this.ignoreAbove = ignoreAbove; | ||
| return this; | ||
| } | ||
|
|
||
| public Builder splitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) { | ||
| fieldType().setSplitQueriesOnWhitespace(splitQueriesOnWhitespace); | ||
| return builder; | ||
| } | ||
|
|
||
| @Override | ||
| public Builder addMultiField(Mapper.Builder mapperBuilder) { | ||
| throw new UnsupportedOperationException("[fields] is not supported for [" + CONTENT_TYPE + "] fields."); | ||
| } | ||
|
|
||
| @Override | ||
| public Builder copyTo(CopyTo copyTo) { | ||
| throw new UnsupportedOperationException("[copy_to] is not supported for [" + CONTENT_TYPE + "] fields."); | ||
| } | ||
|
|
||
| @Override | ||
| public Builder store(boolean store) { | ||
| throw new UnsupportedOperationException("[store] is not currently supported for [" + | ||
| CONTENT_TYPE + "] fields."); | ||
| } | ||
|
|
||
| @Override | ||
| public JsonFieldMapper build(BuilderContext context) { | ||
| setupFieldType(context); | ||
| if (fieldType().splitQueriesOnWhitespace()) { | ||
| fieldType().setSearchAnalyzer(WHITESPACE_ANALYZER); | ||
| } | ||
| return new JsonFieldMapper(name, fieldType, defaultFieldType, | ||
| ignoreAbove, context.indexSettings()); | ||
| } | ||
| } | ||
|
|
||
| public static class TypeParser implements Mapper.TypeParser { | ||
| @Override | ||
| public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException { | ||
| JsonFieldMapper.Builder builder = new JsonFieldMapper.Builder(name); | ||
| parseField(builder, name, node, parserContext); | ||
| for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) { | ||
| Map.Entry<String, Object> entry = iterator.next(); | ||
| String propName = entry.getKey(); | ||
| Object propNode = entry.getValue(); | ||
| if (propName.equals("ignore_above")) { | ||
| builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); | ||
| iterator.remove(); | ||
| } else if (propName.equals("null_value")) { | ||
| if (propNode == null) { | ||
| throw new MapperParsingException("Property [null_value] cannot be null."); | ||
| } | ||
| builder.nullValue(propNode.toString()); | ||
| iterator.remove(); | ||
| } else if (propName.equals("split_queries_on_whitespace")) { | ||
| builder.splitQueriesOnWhitespace | ||
| (XContentMapValues.nodeBooleanValue(propNode, "split_queries_on_whitespace")); | ||
| iterator.remove(); | ||
| } | ||
| } | ||
| return builder; | ||
| } | ||
| } | ||
|
|
||
| public static final class JsonFieldType extends StringFieldType { | ||
| private boolean splitQueriesOnWhitespace; | ||
|
|
||
        /**
         * Leaf values are indexed as exact keywords, so both index and search
         * default to the keyword (no-op) analyzer.
         */
        public JsonFieldType() {
            setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
            setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
        }

        // Copy constructor used by clone(); preserves the whitespace-splitting flag
        // on top of the state copied by the superclass.
        private JsonFieldType(JsonFieldType ref) {
            super(ref);
            this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace;
        }
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) return true; | ||
| if (o == null || getClass() != o.getClass()) return false; | ||
| if (!super.equals(o)) return false; | ||
| JsonFieldType that = (JsonFieldType) o; | ||
| return splitQueriesOnWhitespace == that.splitQueriesOnWhitespace; | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(super.hashCode(), splitQueriesOnWhitespace); | ||
| } | ||
|
|
||
| public JsonFieldType clone() { | ||
| return new JsonFieldType(this); | ||
| } | ||
|
|
||
| @Override | ||
| public String typeName() { | ||
| return CONTENT_TYPE; | ||
| } | ||
|
|
||
| public boolean splitQueriesOnWhitespace() { | ||
| return splitQueriesOnWhitespace; | ||
| } | ||
|
|
||
| public void setSplitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) { | ||
| checkIfFrozen(); | ||
| this.splitQueriesOnWhitespace = splitQueriesOnWhitespace; | ||
| } | ||
|
|
||
| @Override | ||
| public Query existsQuery(QueryShardContext context) { | ||
| return new TermQuery(new Term(FieldNamesFieldMapper.NAME, name())); | ||
| } | ||
|
|
||
| @Override | ||
| public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm disallowing these more advanced queries for now, but will look into supporting them in a follow-up (tracked on the meta-issue). In addition to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure regarding why wildcard is different. I suspect its just because we haven't before needed for change the behaviour of wildcard queries based on the field type. I can't see any reason not to change it so we can control the behaviour here though. If we do make the change it might be best to make it directly on master and then pull the change into this branch to disallow it as the change may become difficult to maintain in the feature branch |
||
| boolean transpositions) { | ||
| throw new UnsupportedOperationException("[fuzzy] queries are not currently supported on [" + | ||
| CONTENT_TYPE + "] fields."); | ||
| } | ||
|
|
||
| @Override | ||
| public Query regexpQuery(String value, int flags, int maxDeterminizedStates, | ||
| MultiTermQuery.RewriteMethod method, QueryShardContext context) { | ||
| throw new UnsupportedOperationException("[regexp] queries are not currently supported on [" + | ||
| CONTENT_TYPE + "] fields."); | ||
| } | ||
|
|
||
| @Override | ||
| public Query wildcardQuery(String value, | ||
| MultiTermQuery.RewriteMethod method, | ||
| QueryShardContext context) { | ||
| throw new UnsupportedOperationException("[wildcard] queries are not currently supported on [" + | ||
| CONTENT_TYPE + "] fields."); | ||
| } | ||
|
|
||
| @Override | ||
| public Object valueForDisplay(Object value) { | ||
| if (value == null) { | ||
| return null; | ||
| } | ||
| BytesRef binaryValue = (BytesRef) value; | ||
| return binaryValue.utf8ToString(); | ||
| } | ||
| } | ||
|
|
||
| private final JsonFieldParser fieldParser; | ||
| private int ignoreAbove; | ||
|
|
||
    /**
     * Private constructor; instances are created through {@link Builder#build}.
     *
     * @param ignoreAbove leaf values longer than this are skipped at parse time
     */
    private JsonFieldMapper(String simpleName,
                            MappedFieldType fieldType,
                            MappedFieldType defaultFieldType,
                            int ignoreAbove,
                            Settings indexSettings) {
        // Multi-fields and copy_to are deliberately unsupported for this type.
        super(simpleName, fieldType, defaultFieldType, indexSettings, MultiFields.empty(), CopyTo.empty());
        // Builder#indexOptions should have rejected anything beyond DOCS_AND_FREQS.
        assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;

        this.ignoreAbove = ignoreAbove;
        this.fieldParser = new JsonFieldParser(fieldType, ignoreAbove);
    }
|
|
||
    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }

    /**
     * Applies settings from an updated mapping.
     * NOTE(review): fieldParser is not rebuilt here, so it keeps the ignoreAbove value
     * it was constructed with — confirm whether a merged ignore_above should take
     * effect on subsequent parsing (fieldParser is final, so a fix would also touch
     * the field declaration).
     */
    @Override
    protected void doMerge(Mapper mergeWith) {
        super.doMerge(mergeWith);
        this.ignoreAbove = ((JsonFieldMapper) mergeWith).ignoreAbove;
    }

    @Override
    protected JsonFieldMapper clone() {
        return (JsonFieldMapper) super.clone();
    }

    @Override
    public JsonFieldType fieldType() {
        return (JsonFieldType) super.fieldType();
    }
|
|
||
| @Override | ||
| protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException { | ||
| if (context.parser().currentToken() == XContentParser.Token.VALUE_NULL) { | ||
| return; | ||
| } | ||
|
|
||
| if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the stored check here is obsolete since we aren't adding an option for a stored field as per https://github.com/elastic/elasticsearch/pull/33923/files#r219319713 below? We should probably also throw an exception if Also I wonder if its worth even allowing the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The lucene fields we're adding are still stored if For the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think I'm still missed something here so maybe we can clear up my confusions with an example? Say we have a document as follows and Are you saying that if
True
I guess the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your understanding of the current behavior for stored fields is correct -- in this PR, a separate stored field is created for each leaf value. I think that later we should switch to the approach where we create a single stored field with the whole JSON blob (and have made a note of this on the meta-issue), but I was curious to play around with this first more naive approach. I'm also okay disallowing stored fields for now until we address the issue in a dedicated PR.
I think we would have these frequencies, because we'd calculate the frequency of the term
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Good point. In which case I agree that index options do make sense.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After catching up with @romseygeek and @colings86 offline, I decided to disable stored fields in this PR to cut down on confusion. |
||
| fields.addAll(fieldParser.parse(context.parser())); | ||
| createFieldNamesField(context, fields); | ||
| } else { | ||
| context.parser().skipChildren(); | ||
| } | ||
| } | ||
|
|
||
    /**
     * Serializes the mapper-specific settings ('null_value', 'ignore_above',
     * 'split_queries_on_whitespace'), omitting values that match the defaults
     * unless defaults were explicitly requested.
     */
    @Override
    protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
        super.doXContentBody(builder, includeDefaults, params);

        if (includeDefaults || fieldType().nullValue() != null) {
            builder.field("null_value", fieldType().nullValue());
        }

        if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) {
            builder.field("ignore_above", ignoreAbove);
        }

        if (includeDefaults || fieldType().splitQueriesOnWhitespace()) {
            builder.field("split_queries_on_whitespace", fieldType().splitQueriesOnWhitespace());
        }
    }
| } | ||
Uh oh!
There was an error while loading. Please reload this page.