Skip to content

Commit 8e7f91f

Browse files
authored
[Tooling] Add --exclude flag to Generator to support field removal testing (#1411) (#1432)
* add --exclude flag to Generator
1 parent f2e013b commit 8e7f91f

17 files changed

Lines changed: 308 additions & 79 deletions

CHANGELOG.next.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Thanks, you're awesome :-) -->
1919
#### Improvements
2020

2121
* Fix ecs GitHub repo link source branch #1393
22+
* Add --exclude flag to Generator to support field removal testing #1411
2223

2324
#### Deprecated
2425

USAGE.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ relevant artifacts for their unique set of data sources.
2626
* [Generator Options](#generator-options)
2727
+ [Out](#out)
2828
+ [Include](#include)
29+
+ [Exclude](#exclude)
2930
+ [Subset](#subset)
3031
+ [Ref](#ref)
3132
+ [Mapping & Template Settings](#mapping--template-settings)
@@ -192,6 +193,41 @@ Include can be used together with the `--ref` flag to merge custom fields into a
192193

193194
> NOTE: The `--include` mechanism will not validate custom YAML files prior to merging. This allows for modifying existing ECS fields in a custom schema without having to redefine all the mandatory field attributes.
194195
196+
#### Exclude
197+
198+
Use the `--exclude` flag to generate ephemeral ECS artifacts based on the current ECS schema field definitions minus fields considered for removal, e.g. to assess the impact of removing these. Warning! This is not the recommended route to remove a field permanently, as it is not intended to be invoked during the build process. Definitive field removal should be implemented using a custom [Subset](#subset) or via the [RFC process](https://github.com/elastic/ecs/tree/master/rfcs/README.md). Example:
199+
200+
```
201+
$ python scripts/generator.py --exclude=../my-project/my-exclude-file.yml
202+
$ python scripts/generator.py --exclude="../my-project/schemas/a*.yml"
203+
```
204+
205+
The `--exclude` flag expects a path to one or more YAML files using the same [file format](https://github.com/elastic/ecs/tree/master/schemas#fields-supported-in-schemasyml) as the ECS schema files. You can also use a subset, provided that relevant `name` and `fields` fields are preserved.
206+
207+
```
208+
---
209+
- name: log
210+
fields:
211+
- name: original
212+
```
213+
214+
The root Field Set `name` must always be present and specified with no dots `.`. Subfields may be specified using dot notation, for example:
215+
216+
```
217+
---
218+
- name: log
219+
fields:
220+
- name: syslog.severity.name
221+
```
222+
223+
Generate artifacts using `--exclude` to remove the specified field definitions, in addition to `--out` to place the output in the desired directory:
224+
225+
```
226+
$ python scripts/generator.py --exclude ../myproject/exclude-set.yml/ --out ../myproject/out/
227+
Loading schemas from local files
228+
Running generator. ECS version 1.11.0
229+
```
230+
195231
#### Subset
196232

197233
If your indices will never populate particular ECS fields, there's no need to include those field definitions in your index mappings. The `--subset` argument allows for passing a subset definition YAML file which indicates which field sets or specific fields to include in the generated artifacts.

scripts/generator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from schema import cleaner
1717
from schema import finalizer
1818
from schema import subset_filter
19+
from schema import exclude_filter
1920

2021

2122
def main():
@@ -51,6 +52,7 @@ def main():
5152
cleaner.clean(fields, strict=args.strict)
5253
finalizer.finalize(fields)
5354
fields = subset_filter.filter(fields, args.subset, out_dir)
55+
fields = exclude_filter.exclude(fields, args.exclude)
5456
nested, flat = intermediate_files.generate(fields, os.path.join(out_dir, 'ecs'), default_dirs)
5557

5658
if args.intermediate_only:
@@ -60,7 +62,7 @@ def main():
6062
es_template.generate(nested, ecs_generated_version, out_dir, args.mapping_settings)
6163
es_template.generate_legacy(flat, ecs_generated_version, out_dir, args.template_settings, args.mapping_settings)
6264
beats.generate(nested, ecs_generated_version, out_dir)
63-
if args.include or args.subset:
65+
if args.include or args.subset or args.exclude:
6466
exit()
6567

6668
ecs_helpers.make_dirs(docs_dir)
@@ -73,6 +75,8 @@ def argument_parser():
7375
Note that "--include experimental/schemas" will also respect this git ref.')
7476
parser.add_argument('--include', nargs='+',
7577
help='include user specified directory of custom field definitions')
78+
parser.add_argument('--exclude', nargs='+',
79+
help='exclude user specified subset of the schema')
7680
parser.add_argument('--subset', nargs='+',
7781
help='render a subset of the schema')
7882
parser.add_argument('--out', action='store', help='directory to output the generated files')

scripts/generators/beats.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ def write_beats_yaml(beats_file, ecs_version, out_dir):
8383

8484

8585
def file_header():
    """Return the generated-file warning banner for the Beats YAML output.

    The `{version}` placeholder is substituted later via str.format.
    """
    banner = """
# WARNING! Do not edit this file directly, it was generated by the ECS project,
# based on ECS version {version}.
# Please visit https://github.com/elastic/ecs to suggest changes to ECS fields.
"""
    # Drop the leading newline that follows the opening triple quote.
    return banner.lstrip()

scripts/generators/ecs_helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def yaml_load(filename):
159159

160160

161161
def list_subtract(original, subtracted):
    """Return the elements of `original` that are not in `subtracted` (i.e. original - subtracted)."""
    return [item for item in original if item not in subtracted]
164164

165165

@@ -175,7 +175,7 @@ def list_extract_keys(lst, key_name):
175175

176176

177177
def is_intermediate(field):
    """Check whether a field is an intermediate field rather than a "real" field."""
    details = field['field_details']
    return 'intermediate' in details and details['intermediate']
180180

181181

scripts/generators/es_template.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,13 +263,13 @@ def default_mapping_settings():
263263

264264

265265
def es6_type_fallback(mappings):
266-
'''
266+
"""
267267
Visits each leaf in mappings object and fallback to an
268268
Elasticsearch 6.x supported type.
269269
270270
Since a field like `wildcard` won't have the same defaults as
271271
a `keyword` field, we must add any missing defaults.
272-
'''
272+
"""
273273

274274
for (name, details) in mappings.items():
275275
if 'type' in details:

scripts/generators/intermediate_files.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ def generate(fields, out_dir, default_dirs):
2020

2121

2222
def generate_flat_fields(fields):
    """Generate the flat field structure written to ecs_flat.yml.

    Drops reusable field sets that are not at the root (top_level=false),
    then accumulates every remaining field into a single flat dict.
    """
    filtered = remove_non_root_reusables(fields)
    flattened = {}
    visitor.visit_fields_with_memo(filtered, accumulate_field, flattened)
    return flattened
2828

2929

3030
def accumulate_field(details, memo):
31-
'''Visitor function that accumulates all field details in the memo dict'''
31+
"""Visitor function that accumulates all field details in the memo dict"""
3232
if 'schema_details' in details or ecs_helpers.is_intermediate(details):
3333
return
3434
field_details = copy.deepcopy(details['field_details'])
@@ -39,7 +39,7 @@ def accumulate_field(details, memo):
3939

4040

4141
def generate_nested_fields(fields):
42-
'''Generate ecs_nested.yml'''
42+
"""Generate ecs_nested.yml"""
4343
nested = {}
4444
# Flatten each field set, but keep all resulting fields nested under their
4545
# parent/host field set.
@@ -71,13 +71,13 @@ def generate_nested_fields(fields):
7171

7272

7373
def remove_internal_attributes(field_details):
    """Strip attributes that only matter to the deeply nested structure,
    not to ecs_flat.yml / ecs_nested.yml. Mutates field_details in place."""
    for internal_attr in ('node_name', 'intermediate'):
        field_details.pop(internal_attr, None)

7878

7979
def remove_non_root_reusables(fields_nested):
80-
'''
80+
"""
8181
Remove field sets that have top_level=false from the root of the field definitions.
8282
8383
This attribute means they're only meant to be in the "reusable/expected" locations
@@ -87,7 +87,7 @@ def remove_non_root_reusables(fields_nested):
8787
still needs to keep all field sets at the root of the YAML file, as it
8888
the official information about each field set. It's the responsibility of
8989
users consuming ecs_nested.yml to skip the field sets with top_level=false.
90-
'''
90+
"""
9191
fields = {}
9292
for (name, field) in fields_nested.items():
9393
if 'reusable' not in field['schema_details'] or field['schema_details']['reusable']['top_level']:

scripts/schema/cleaner.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def schema_cleanup(schema):
5656

5757

5858
def schema_mandatory_attributes(schema):
59-
'''Ensures for the presence of the mandatory schema attributes and raises if any are missing'''
59+
"""Ensures for the presence of the mandatory schema attributes and raises if any are missing"""
6060
current_schema_attributes = sorted(list(schema['field_details'].keys()) +
6161
list(schema['schema_details'].keys()))
6262
missing_attributes = ecs_helpers.list_subtract(SCHEMA_MANDATORY_ATTRIBUTES, current_schema_attributes)
@@ -74,7 +74,7 @@ def schema_mandatory_attributes(schema):
7474

7575

7676
def schema_assertions_and_warnings(schema):
77-
'''Additional checks on a fleshed out schema'''
77+
"""Additional checks on a fleshed out schema"""
7878
single_line_short_description(schema, strict=strict_mode)
7979
if 'beta' in schema['field_details']:
8080
single_line_beta_description(schema, strict=strict_mode)
@@ -143,7 +143,7 @@ def field_defaults(field):
143143

144144

145145
def field_or_multi_field_datatype_defaults(field_details):
146-
'''Sets datatype-related defaults on a canonical field or multi-field entries.'''
146+
"""Sets datatype-related defaults on a canonical field or multi-field entries."""
147147
if field_details['type'] == 'keyword':
148148
field_details.setdefault('ignore_above', 1024)
149149
if field_details['type'] == 'text':
@@ -160,7 +160,7 @@ def field_or_multi_field_datatype_defaults(field_details):
160160

161161

162162
def field_mandatory_attributes(field):
163-
'''Ensures for the presence of the mandatory field attributes and raises if any are missing'''
163+
"""Ensures for the presence of the mandatory field attributes and raises if any are missing"""
164164
if ecs_helpers.is_intermediate(field):
165165
return
166166
current_field_attributes = sorted(field['field_details'].keys())
@@ -180,7 +180,7 @@ def field_mandatory_attributes(field):
180180

181181

182182
def field_assertions_and_warnings(field):
183-
'''Additional checks on a fleshed out field'''
183+
"""Additional checks on a fleshed out field"""
184184
if not ecs_helpers.is_intermediate(field):
185185
# check short description length if in strict mode
186186
single_line_short_description(field, strict=strict_mode)

scripts/schema/exclude_filter.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from schema import loader
2+
3+
# This script should be run downstream of the subset filters - it takes
4+
# all ECS and custom fields already loaded by the latter and explicitly
5+
# removes a subset, for example, to simulate impact of future removals
6+
7+
8+
def exclude(fields, exclude_file_globs):
    """Entry point: remove the fields matched by the --exclude YAML globs.

    Returns the (possibly pruned) fields dict; a no-op when no globs are given.
    """
    definitions = load_exclude_definitions(exclude_file_globs)
    if not definitions:
        return fields
    return exclude_fields(fields, definitions)
15+
16+
17+
def long_path(path_as_list):
    """Join an ordered list of node names into a dotted path string."""
    return '.'.join(path_as_list)
19+
20+
21+
def pop_field(fields, node_path, path, removed):
    """Pop a field from the yaml-derived dict, walking node_path segment by segment.

    Args:
        fields: nested fields dict to mutate.
        node_path: remaining path segments to traverse (consumed destructively).
        path: the complete original path, used for flat names and error messages.
        removed: flat names removed so far; used to tolerate paths whose parent
            was removed by an earlier exclusion (those entries may be None).

    Returns the removed field's flat dotted name, or None when the path is
    already covered by an earlier removal. Raises ValueError when the path
    cannot be resolved.
    """
    if node_path[0] in fields:
        if len(node_path) == 1:
            flat_name = long_path(path)
            fields.pop(node_path[0])
            return flat_name
        inner_field = node_path.pop(0)
        if 'fields' not in fields[inner_field]:
            raise ValueError(
                '--exclude specified, but no path to field {} found'.format(long_path(path)))
        popped = pop_field(fields[inner_field]['fields'], node_path, path, removed)
        # if object field with no remaining fields and not 'base', pop it
        if fields[inner_field]['fields'] == {} and inner_field != 'base':
            fields.pop(inner_field)
        return popped
    this_long_path = long_path(path)
    # Tolerate the case where a parent of this path was already removed.
    # NOTE: the loop variable must not shadow the long_path() helper.
    if not any(this_long_path.startswith(done) for done in removed if done is not None):
        raise ValueError('--exclude specified, but no field {} found'.format(this_long_path))
44+
45+
46+
def exclude_trace_path(fields, item, path, removed):
    """Traverse paths to one or more nodes in a yaml-derived dict, popping each.

    Args:
        fields: nested fields dict to mutate.
        item: list of exclusion entries, each with a 'name' (leaf entries must
            not carry a nested 'fields' key at this level).
        path: path segments accumulated from the enclosing field sets.
        removed: accumulator of flat names removed so far.

    Raises ValueError when an entry still contains nested 'fields'.
    """
    for list_item in item:
        node_path = path.copy()
        # cater for name.with.dots
        for name in list_item['name'].split('.'):
            node_path.append(name)
        if 'fields' not in list_item:
            parent = node_path[0]
            removed.append(pop_field(fields, node_path, node_path.copy(), removed))
            # if parent field has no remaining fields and not 'base', pop it
            if parent != 'base' and parent in fields and len(fields[parent]['fields']) == 0:
                fields.pop(parent)
        else:
            raise ValueError('--exclude specified, can\'t parse fields in file {}'.format(item))
61+
62+
63+
def exclude_fields(fields, excludes):
    """Traverse fields and eliminate every field matching the exclude definitions."""
    for exclusion_list in excludes or []:
        for entry in exclusion_list:
            exclude_trace_path(fields, entry['fields'], [entry['name']], [])
    return fields
70+
71+
72+
def load_exclude_definitions(file_globs):
    """Load exclusion YAML definitions for the given globs; [] when none given.

    Raises ValueError when globs were supplied but matched no definitions.
    """
    if not file_globs:
        return []
    definitions = loader.load_definitions(file_globs)
    if not definitions:
        raise ValueError('--exclude specified, but no exclusions found in {}'.format(file_globs))
    return definitions

scripts/schema/finalizer.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020

2121
def finalize(fields):
    """Intended entrypoint of the finalizer.

    Performs field set reuse, then calculates final per-field values
    (path-based attributes such as flat_name).
    """
    perform_reuse(fields)
    calculate_final_values(fields)
2525

@@ -46,7 +46,7 @@ def order_reuses(fields):
4646

4747

4848
def perform_reuse(fields):
49-
'''Performs field reuse in two phases'''
49+
"""Performs field reuse in two phases"""
5050
foreign_reuses, self_nestings = order_reuses(fields)
5151

5252
# Phase 1: foreign reuse
@@ -99,11 +99,11 @@ def perform_reuse(fields):
9999

100100

101101
def ensure_valid_reuse(reused_schema, destination_schema=None):
102-
'''
102+
"""
103103
Raise if either the reused schema or destination schema have root=true.
104104
105105
Second param is optional, if testing for a self-nesting (where source=destination).
106-
'''
106+
"""
107107
if reused_schema['schema_details']['root']:
108108
msg = "Schema {} has attribute root=true and therefore cannot be reused.".format(
109109
reused_schema['field_details']['name'])
@@ -115,7 +115,7 @@ def ensure_valid_reuse(reused_schema, destination_schema=None):
115115

116116

117117
def append_reused_here(reused_schema, reuse_entry, destination_schema):
118-
'''Captures two ways of denoting what field sets are reused under a given field set'''
118+
"""Captures two ways of denoting what field sets are reused under a given field set"""
119119
# Legacy, too limited
120120
destination_schema['schema_details'].setdefault('nestings', [])
121121
destination_schema['schema_details']['nestings'] = sorted(
@@ -136,15 +136,15 @@ def append_reused_here(reused_schema, reuse_entry, destination_schema):
136136

137137

138138
def set_original_fieldset(fields, original_fieldset):
    """Recursively set 'original_fieldset' on every field in a group of fields."""
    def tag_field(details):
        # Don't override if already set (e.g. 'group' for user.group.* fields)
        details['field_details'].setdefault('original_fieldset', original_fieldset)
    visitor.visit_fields(fields, field_func=tag_field)
144144

145145

146146
def field_group_at_path(dotted_path, fields):
147-
'''Returns the ['fields'] hash at the dotted_path.'''
147+
"""Returns the ['fields'] hash at the dotted_path."""
148148
path = dotted_path.split('.')
149149
nesting = fields
150150
for next_field in path:
@@ -163,17 +163,17 @@ def field_group_at_path(dotted_path, fields):
163163

164164

165165
def calculate_final_values(fields):
    """
    This function navigates all fields recursively.

    It populates a few more values for the fields, especially path-based values
    like flat_name.
    """
    visitor.visit_fields_with_path(fields, field_finalizer)
173173

174174

175175
def field_finalizer(details, path):
176-
'''This is the function called by the visitor to perform the work of calculate_final_values'''
176+
"""This is the function called by the visitor to perform the work of calculate_final_values"""
177177
name_array = path + [details['field_details']['node_name']]
178178
flat_name = '.'.join(name_array)
179179
details['field_details']['flat_name'] = flat_name

0 commit comments

Comments
 (0)