Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.query.QueryShardContext;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import static org.elasticsearch.index.mapper.TypeParsers.parseField;

/**
* A field mapper that accepts a JSON object and flattens it into a single field. This data type
* can be a useful alternative to an 'object' mapping when the object has a large, unknown set
* of keys.
*
* Currently the mapper extracts all leaf values of the JSON object, converts them to their text
* representations, and indexes each one as a keyword. As an example, given a json field called
* 'json_field' and the following input
*
* {
* "json_field": {
* "key1": "some value",
* "key2": {
* "key3": true
* }
* }
* }
*
* the mapper will produce untokenized string fields with the values "some value" and "true".
*/
public final class JsonFieldMapper extends FieldMapper {

public static final String CONTENT_TYPE = "json";
public static final NamedAnalyzer WHITESPACE_ANALYZER = new NamedAnalyzer(
"whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer());

/**
 * Default settings shared by every [json] field mapper: the frozen default field type
 * and the default {@code ignore_above} limit.
 */
private static class Defaults {
    public static final int IGNORE_ABOVE = Integer.MAX_VALUE;

    public static final MappedFieldType FIELD_TYPE = newFrozenFieldType();

    /**
     * Creates the default field type: untokenized, norms omitted, not stored,
     * indexed with {@link IndexOptions#DOCS}, and frozen before it is returned.
     */
    private static MappedFieldType newFrozenFieldType() {
        final JsonFieldType type = new JsonFieldType();
        type.setTokenized(false);
        type.setOmitNorms(true);
        type.setStored(false);
        type.setIndexOptions(IndexOptions.DOCS);
        type.freeze();
        return type;
    }
}

public static class Builder extends FieldMapper.Builder<Builder, JsonFieldMapper> {
private int ignoreAbove = Defaults.IGNORE_ABOVE;

/**
 * Creates a builder for a [json] field, using {@link Defaults#FIELD_TYPE} both as the
 * initial field type and as the default field type.
 *
 * @param name the name of the field being built
 */
public Builder(String name) {
    super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
    this.builder = this;
}

/**
 * Returns the field type under construction, narrowed to {@link JsonFieldType}.
 */
@Override
public JsonFieldType fieldType() {
    final MappedFieldType type = super.fieldType();
    return (JsonFieldType) type;
}

/**
 * Sets the index options, rejecting anything that sorts after
 * {@link IndexOptions#DOCS_AND_FREQS}.
 *
 * @param indexOptions the requested index options
 * @return the result of the superclass setter
 * @throws IllegalArgumentException if the requested options sort after DOCS_AND_FREQS
 */
@Override
public Builder indexOptions(IndexOptions indexOptions) {
    final boolean withinLimit = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
    if (withinLimit) {
        return super.indexOptions(indexOptions);
    }
    throw new IllegalArgumentException("The [" + CONTENT_TYPE
        + "] field does not support positions, got [index_options]="
        + indexOptionToString(indexOptions));
}

/**
 * Sets the {@code ignore_above} limit handed to the mapper when it is built.
 *
 * @param ignoreAbove the limit; only negative values are rejected, so zero is accepted
 * @return this builder
 * @throws IllegalArgumentException if {@code ignoreAbove} is negative
 */
public Builder ignoreAbove(int ignoreAbove) {
    final boolean acceptable = ignoreAbove >= 0;
    if (!acceptable) {
        throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove);
    }
    this.ignoreAbove = ignoreAbove;
    return this;
}

/**
 * Records on the field type whether queries should be split on whitespace.
 *
 * @param splitQueriesOnWhitespace the flag to store on the field type
 * @return the builder
 */
public Builder splitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) {
    final JsonFieldType type = fieldType();
    type.setSplitQueriesOnWhitespace(splitQueriesOnWhitespace);
    return builder;
}
Comment thread
colings86 marked this conversation as resolved.
Outdated

/**
 * Always fails: multi-fields ([fields]) are not supported on [json] fields.
 *
 * @throws UnsupportedOperationException unconditionally
 */
@Override
public Builder addMultiField(Mapper.Builder mapperBuilder) {
    final String message = "[fields] is not supported for [" + CONTENT_TYPE + "] fields.";
    throw new UnsupportedOperationException(message);
}

/**
 * Always fails: [copy_to] is not supported on [json] fields.
 *
 * @throws UnsupportedOperationException unconditionally
 */
@Override
public Builder copyTo(CopyTo copyTo) {
    final String message = "[copy_to] is not supported for [" + CONTENT_TYPE + "] fields.";
    throw new UnsupportedOperationException(message);
}

/**
 * Always fails: [store] is not currently supported on [json] fields.
 *
 * @throws UnsupportedOperationException unconditionally
 */
@Override
public Builder store(boolean store) {
    final String message = "[store] is not currently supported for [" + CONTENT_TYPE + "] fields.";
    throw new UnsupportedOperationException(message);
}

/**
 * Builds the mapper. After the field type has been set up, the search analyzer is
 * switched to {@link #WHITESPACE_ANALYZER} when splitting queries on whitespace is enabled.
 *
 * @param context the builder context, which also supplies the index settings
 * @return the new mapper
 */
@Override
public JsonFieldMapper build(BuilderContext context) {
    setupFieldType(context);

    final JsonFieldType type = fieldType();
    final boolean splitOnWhitespace = type.splitQueriesOnWhitespace();
    if (splitOnWhitespace) {
        type.setSearchAnalyzer(WHITESPACE_ANALYZER);
    }

    final Settings indexSettings = context.indexSettings();
    return new JsonFieldMapper(name, fieldType, defaultFieldType, ignoreAbove, indexSettings);
}
}

/**
 * Turns the mapping definition of a [json] field into a {@link JsonFieldMapper.Builder}.
 */
public static class TypeParser implements Mapper.TypeParser {
    /**
     * Applies the common field settings through {@code parseField}, then consumes the
     * properties handled here: {@code ignore_above}, {@code null_value} and
     * {@code split_queries_on_whitespace}. Each handled property is removed from
     * {@code node}; any other property is left in the map.
     *
     * @throws MapperParsingException if {@code null_value} is present but null
     */
    @Override
    public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
        final JsonFieldMapper.Builder jsonBuilder = new JsonFieldMapper.Builder(name);
        parseField(jsonBuilder, name, node, parserContext);

        final Iterator<Map.Entry<String, Object>> properties = node.entrySet().iterator();
        while (properties.hasNext()) {
            final Map.Entry<String, Object> property = properties.next();
            final String propName = property.getKey();
            final Object propNode = property.getValue();

            switch (propName) {
                case "ignore_above":
                    jsonBuilder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1));
                    break;
                case "null_value":
                    if (propNode == null) {
                        throw new MapperParsingException("Property [null_value] cannot be null.");
                    }
                    jsonBuilder.nullValue(propNode.toString());
                    break;
                case "split_queries_on_whitespace":
                    jsonBuilder.splitQueriesOnWhitespace(
                        XContentMapValues.nodeBooleanValue(propNode, "split_queries_on_whitespace"));
                    break;
                default:
                    // Not one of ours: keep the entry in the map.
                    continue;
            }
            // Only reached for a property that was handled above.
            properties.remove();
        }
        return jsonBuilder;
    }
}

public static final class JsonFieldType extends StringFieldType {
private boolean splitQueriesOnWhitespace;

public JsonFieldType() {
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
}

/**
 * Copy constructor: delegates to the superclass copy constructor and carries over the
 * whitespace-splitting flag.
 *
 * @param ref the field type to copy
 */
private JsonFieldType(JsonFieldType ref) {
    super(ref);
    this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace();
}

/**
 * Two field types are equal when they are of the same class, the superclass considers
 * them equal, and their whitespace-splitting flags match.
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass() || !super.equals(o)) {
        return false;
    }
    final JsonFieldType other = (JsonFieldType) o;
    return other.splitQueriesOnWhitespace == splitQueriesOnWhitespace;
}

/**
 * Combines the superclass hash with the whitespace-splitting flag, matching the fields
 * compared in {@code equals}.
 */
@Override
public int hashCode() {
    final int inherited = super.hashCode();
    return Objects.hash(inherited, splitQueriesOnWhitespace);
}

/**
 * Returns a copy of this field type created through the copy constructor, so the
 * whitespace-splitting flag is carried over.
 */
@Override
public JsonFieldType clone() {
    return new JsonFieldType(this);
}

/**
 * Returns the type name of this field type, which is {@link #CONTENT_TYPE}.
 */
@Override
public String typeName() {
return CONTENT_TYPE;
}

/**
 * Returns the whitespace-splitting flag stored on this field type.
 */
public boolean splitQueriesOnWhitespace() {
return splitQueriesOnWhitespace;
}

/**
 * Stores the whitespace-splitting flag on this field type.
 *
 * @param splitQueriesOnWhitespace the new value of the flag
 */
public void setSplitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) {
// Runs the frozen check before mutating; presumably rejects changes to a frozen type — confirm in MappedFieldType.
checkIfFrozen();
this.splitQueriesOnWhitespace = splitQueriesOnWhitespace;
}

/**
 * Builds an exists query as a term query on {@link FieldNamesFieldMapper#NAME} whose
 * term text is this field's name.
 *
 * @param context the shard context (not used here)
 */
@Override
public Query existsQuery(QueryShardContext context) {
    final Term fieldNameTerm = new Term(FieldNamesFieldMapper.NAME, name());
    return new TermQuery(fieldNameTerm);
}

@Override
public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions,
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm disallowing these more advanced queries for now, but will look into supporting them in a follow-up (tracked on the meta-issue).

In addition to fuzzy and regexp, I would also like to disallow wildcard. However, WildcardQueryBuilder doesn't delegate to a method on MappedFieldType. This means we attempt to create wildcard queries on non-text or keyword fields, and usually fail because the value can't be parsed. Do you know if there is a reason for this set-up, or if it's just an oversight that would be good to address?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why wildcard is different. I suspect it's just because we haven't previously needed to change the behaviour of wildcard queries based on the field type. I can't see any reason not to change it so we can control the behaviour here though. If we do make the change, it might be best to make it directly on master and then pull the change into this branch to disallow it, as the change may become difficult to maintain in the feature branch.

boolean transpositions) {
throw new UnsupportedOperationException("[fuzzy] queries are not currently supported on [" +
CONTENT_TYPE + "] fields.");
}

/**
 * Always fails: [regexp] queries are not currently supported on [json] fields.
 *
 * @throws UnsupportedOperationException unconditionally
 */
@Override
public Query regexpQuery(String value, int flags, int maxDeterminizedStates,
                         MultiTermQuery.RewriteMethod method, QueryShardContext context) {
    final String message = "[regexp] queries are not currently supported on [" + CONTENT_TYPE + "] fields.";
    throw new UnsupportedOperationException(message);
}

/**
 * Always fails: [wildcard] queries are not currently supported on [json] fields.
 *
 * @throws UnsupportedOperationException unconditionally
 */
@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
    final String message = "[wildcard] queries are not currently supported on [" + CONTENT_TYPE + "] fields.";
    throw new UnsupportedOperationException(message);
}

/**
 * Converts an indexed value to its display form by decoding it as UTF-8.
 *
 * @param value the value to convert; cast to {@link BytesRef} when non-null
 * @return the decoded string, or {@code null} when {@code value} is null
 */
@Override
public Object valueForDisplay(Object value) {
    return value == null ? null : ((BytesRef) value).utf8ToString();
}
}

// Parser that turns an incoming JSON object into Lucene fields. It is not final because
// it captures the field type and ignore_above at construction time and therefore has to
// be rebuilt whenever a merge changes them.
private JsonFieldParser fieldParser;
private int ignoreAbove;

/**
 * Creates the mapper with no multi-fields and no copy-to targets.
 *
 * @param simpleName       the simple name of the field
 * @param fieldType        the field type; its index options must not sort after DOCS_AND_FREQS
 * @param defaultFieldType the default field type
 * @param ignoreAbove      the ignore_above limit passed on to the field parser
 * @param indexSettings    the index settings
 */
private JsonFieldMapper(String simpleName,
                        MappedFieldType fieldType,
                        MappedFieldType defaultFieldType,
                        int ignoreAbove,
                        Settings indexSettings) {
    super(simpleName, fieldType, defaultFieldType, indexSettings, MultiFields.empty(), CopyTo.empty());
    assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;

    this.ignoreAbove = ignoreAbove;
    this.fieldParser = new JsonFieldParser(fieldType, ignoreAbove);
}

/**
 * Returns the content type of this mapper, which is {@link #CONTENT_TYPE}.
 */
@Override
protected String contentType() {
    return CONTENT_TYPE;
}

/**
 * Merges the settings of {@code mergeWith} into this mapper.
 *
 * Besides adopting the other mapper's {@code ignoreAbove}, the field parser is rebuilt.
 * The parser was constructed from the previous values, so keeping it would make
 * document parsing ignore the merged {@code ignore_above}.
 *
 * @param mergeWith the mapper to merge from; cast to {@link JsonFieldMapper}
 */
@Override
protected void doMerge(Mapper mergeWith) {
    super.doMerge(mergeWith);
    this.ignoreAbove = ((JsonFieldMapper) mergeWith).ignoreAbove;
    // fieldType() is re-read here in case the superclass merge replaced the field type.
    this.fieldParser = new JsonFieldParser(fieldType(), ignoreAbove);
}

/**
 * Returns the superclass clone, narrowed to {@link JsonFieldMapper}.
 */
@Override
protected JsonFieldMapper clone() {
    final FieldMapper copy = super.clone();
    return (JsonFieldMapper) copy;
}

/**
 * Returns this mapper's field type, narrowed to {@link JsonFieldType}.
 */
@Override
public JsonFieldType fieldType() {
    final MappedFieldType type = super.fieldType();
    return (JsonFieldType) type;
}

@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
if (context.parser().currentToken() == XContentParser.Token.VALUE_NULL) {
return;
}

if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the stored check here is obsolete since we aren't adding an option for a stored field as per https://github.com/elastic/elasticsearch/pull/33923/files#r219319713 below? We should probably also throw an exception if store(boolean) is called on the Builder?

Also I wonder if it's worth even allowing the indexOptions to be set? Given you can't turn on term vectors or positions, the only option would be to turn off indexing altogether, which I don't think makes sense since we aren't allowing stored fields? What's left would be the user wanting to ignore the field completely, which is already covered by the object type's "enabled": false setting?

Copy link
Copy Markdown
Contributor Author

@jtibshirani jtibshirani Sep 25, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lucene fields we're adding are still stored if store: true, it's just that each value is added as a separate field (as opposed to adding a single stored field containing the whole JSON blob, as I was brainstorming). I think it'd be nice to see how this naive approach looks on the feature branch, so we can make an informed choice about whether we should switch to a single stored field.

For the index_options question, I guess the user could still decide between docs and freqs. I was mostly aiming for consistency with the keyword-like fields here. Is your thought that frequency information doesn't really make sense in this context either?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's just that each value is added as a separate field (as opposed to adding a single stored field containing the whole JSON blob, as I was brainstorming).

I think I'm still missing something here, so maybe we can clear up my confusion with an example? Say we have a document as follows and json_field is a field of this new type:

{
    "json_field": {
        "a": "foo",
        "b": {
            "c": "bar"
        }
    }
}

Are you saying that if store: true we will create two stored fields in Lucene, one for json_field.a containing the value foo and one for json_field.b.c containing the value bar, rather than creating a single json_field stored field containing the following?

{
    "a": "foo",
    "b": {
        "c": "bar"
    }
}

For the index_options question, I guess the user could still decide between docs and freqs.

True

I was mostly aiming for consistency with the keyword-like fields here. Is your thought that frequency information doesn't really make sense in this context either?

I guess the freqs would make sense for a search that is looking for e.g. "json_field": "foo" (i.e. "foo") anywhere in the JSON blob, but it wouldn't really make sense for a search for e.g. "json_field.a": "foo" since we wouldn't have the frequencies for the inner field alone. I haven't thought about this too much, but maybe for consistency between the two cases (did we decide we were going to support both cases still?) it would be easier to explain without freqs?

Copy link
Copy Markdown
Contributor Author

@jtibshirani jtibshirani Sep 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your understanding of the current behavior for stored fields is correct -- in this PR, a separate stored field is created for each leaf value. I think that later we should switch to the approach where we create a single stored field with the whole JSON blob (and have made a note of this on the meta-issue), but I was curious to play around with this first more naive approach. I'm also okay disallowing stored fields for now until we address the issue in a dedicated PR.

it wouldn't really make sense for a search for e.g. "json_field.a": "foo" since we wouldn't have the frequencies for the inner field alone

I think we would have these frequencies, because we'd calculate the frequency of the term a\0foo (where \0 is a separator character)? And you're right, we are supporting both of those cases.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we would have these frequencies, because we'd calculate the frequency of the term a\0foo (where \0 is a separator character)?

Good point. In which case I agree that index options do make sense.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After catching up with @romseygeek and @colings86 offline, I decided to disable stored fields in this PR to cut down on confusion.

fields.addAll(fieldParser.parse(context.parser()));
createFieldNamesField(context, fields);
} else {
context.parser().skipChildren();
}
}

/**
 * Writes this mapper's settings after the superclass body. {@code null_value},
 * {@code ignore_above} and {@code split_queries_on_whitespace} are each emitted when
 * defaults are requested or when the value differs from its default.
 *
 * @param builder         the builder to write to
 * @param includeDefaults whether settings that still have their default value are written
 * @param params          serialization parameters, passed through to the superclass
 * @throws IOException if writing to the builder fails
 */
@Override
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
    super.doXContentBody(builder, includeDefaults, params);

    final JsonFieldType type = fieldType();

    final Object nullValue = type.nullValue();
    if (includeDefaults || nullValue != null) {
        builder.field("null_value", nullValue);
    }

    final boolean customIgnoreAbove = ignoreAbove != Defaults.IGNORE_ABOVE;
    if (includeDefaults || customIgnoreAbove) {
        builder.field("ignore_above", ignoreAbove);
    }

    final boolean splitOnWhitespace = type.splitQueriesOnWhitespace();
    if (includeDefaults || splitOnWhitespace) {
        builder.field("split_queries_on_whitespace", splitOnWhitespace);
    }
}
}
Loading