Skip to content

Commit 5f52979

Browse files
author
Andrew Stucki
authored
[Processors] Mime-Type Detection (#22940)
* Add mimetype processor * Add mimetype detection for packetbeat * Update changelog * Rev go.sum * Refactor for reusability and rename to detect_mime_type * reformat imports * update docs * Update maxHeaderSize name and add comment on the fallback behavior
1 parent 7c64f53 commit 5f52979

16 files changed

Lines changed: 392 additions & 6 deletions

File tree

CHANGELOG.next.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
540540
- Added support for wildcard fields and keyword fallback in beats setup commands. {pull}22521[22521]
541541
- Fix polling node when it is not ready and monitor by hostname {pull}22666[22666]
542542
- Improve equals check. {pull}22778[22778]
543+
- Added "detect_mime_type" processor for detecting mime types {pull}22940[22940]
543544

544545
*Auditbeat*
545546

@@ -870,6 +871,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
870871
- Add support for overriding the published index on a per-protocol/flow basis. {pull}22134[22134]
871872
- Change build process for x-pack distribution {pull}21979[21979]
872873
- Tuned the internal queue size to reduce the chances of events being dropped. {pull}22650[22650]
874+
- Add support for "http.request.mime_type" and "http.response.mime_type". {pull}22940[22940]
873875

874876
*Functionbeat*
875877

NOTICE.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10142,11 +10142,11 @@ Contents of probable licence file $GOMODCACHE/github.com/gorhill/cronexpr@v0.0.0
1014210142

1014310143
--------------------------------------------------------------------------------
1014410144
Dependency : github.com/h2non/filetype
10145-
Version: v1.0.12
10145+
Version: v1.1.1-0.20201130172452-f60988ab73d5
1014610146
Licence type (autodetected): MIT
1014710147
--------------------------------------------------------------------------------
1014810148

10149-
Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.0.12/LICENSE:
10149+
Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.1.1-0.20201130172452-f60988ab73d5/LICENSE:
1015010150

1015110151
The MIT License
1015210152

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ require (
9797
github.com/gorilla/mux v1.7.2 // indirect
9898
github.com/gorilla/websocket v1.4.1 // indirect
9999
github.com/grpc-ecosystem/grpc-gateway v1.13.0 // indirect
100-
github.com/h2non/filetype v1.0.12
100+
github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5
101101
github.com/hashicorp/go-multierror v1.1.0
102102
github.com/hashicorp/go-retryablehttp v0.6.6
103103
github.com/hashicorp/golang-lru v0.5.2-0.20190520140433-59383c442f7d // indirect

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,8 @@ github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad
416416
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
417417
github.com/grpc-ecosystem/grpc-gateway v1.13.0 h1:sBDQoHXrOlfPobnKw69FIKa1wg9qsLLvvQ/Y19WtFgI=
418418
github.com/grpc-ecosystem/grpc-gateway v1.13.0/go.mod h1:8XEsbTttt/W+VvjtQhLACqCisSPWTxCZ7sBRjU6iH9c=
419-
github.com/h2non/filetype v1.0.12 h1:yHCsIe0y2cvbDARtJhGBTD2ecvqMSTvlIcph9En/Zao=
420-
github.com/h2non/filetype v1.0.12/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
419+
github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5 h1:xI88renBpIJws9OfEQq4Dng10OppnY5u9bTok/GDFEI=
420+
github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
421421
github.com/hashicorp/errwrap v0.0.0-20141028054710-7554cd9344ce/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
422422
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
423423
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=

libbeat/docs/processors-list.asciidoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ endif::[]
6262
ifndef::no_decompress_gzip_field_processor[]
6363
* <<decompress-gzip-field,`decompress_gzip_field`>>
6464
endif::[]
65+
ifndef::no_detect_mime_type_processor[]
66+
* <<detect-mime-type,`detect_mime_type`>>
67+
endif::[]
6568
ifndef::no_dissect_processor[]
6669
* <<dissect, `dissect`>>
6770
endif::[]
@@ -168,6 +171,9 @@ endif::[]
168171
ifndef::no_decompress_gzip_field_processor[]
169172
include::{libbeat-processors-dir}/actions/docs/decompress_gzip_field.asciidoc[]
170173
endif::[]
174+
ifndef::no_detect_mime_type_processor[]
175+
include::{libbeat-processors-dir}/actions/docs/detect_mime_type.asciidoc[]
176+
endif::[]
171177
ifndef::no_dissect_processor[]
172178
include::{libbeat-processors-dir}/dissect/docs/dissect.asciidoc[]
173179
endif::[]

libbeat/mime/byte.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// Licensed to Elasticsearch B.V. under one or more contributor
2+
// license agreements. See the NOTICE file distributed with
3+
// this work for additional information regarding copyright
4+
// ownership. Elasticsearch B.V. licenses this file to you under
5+
// the Apache License, Version 2.0 (the "License"); you may
6+
// not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package mime
19+
20+
import (
	"bytes"
	"encoding/json"
	"encoding/xml"
	"io"
	"net/http"
	"strings"

	"github.com/h2non/filetype"
)
28+
29+
// maxHeaderSize is the number of leading payload bytes used for MIME
// detection. Office file detection requires ~8kb to detect properly.
const maxHeaderSize = 8192
34+
35+
// DetectBytes tries to detect a mime-type based off
36+
// of a chunk of bytes passed into the function
37+
func DetectBytes(data []byte) string {
38+
header := data
39+
if len(data) > maxHeaderSize {
40+
header = data[:maxHeaderSize]
41+
}
42+
kind, err := filetype.Match(header)
43+
if err == nil && kind != filetype.Unknown {
44+
// we have a known filetype, return
45+
return kind.MIME.Value
46+
}
47+
// if the above fails, try and sniff with http sniffing
48+
netType := http.DetectContentType(header)
49+
// try and parse any sort of text as json or xml
50+
if strings.HasPrefix(netType, "text/plain") {
51+
if detected := detectEncodedText(data); detected != "" {
52+
return detected
53+
}
54+
}
55+
// The fallback for http.DetectContentType is "application/octet-stream"
56+
// meaning that if we see it, we were unable to determine the type and
57+
// we just know we're dealing with a chunk of some sort of bytes. Rather
58+
// than reporting the fallback, we'll just say we were unable to detect
59+
// the type.
60+
if netType == "application/octet-stream" {
61+
return ""
62+
}
63+
return netType
64+
}
65+
66+
func detectEncodedText(data []byte) string {
67+
// figure out how to optimize this so we don't have to try and parse the whole payload
68+
// every time
69+
if json.Valid(data) {
70+
return "application/json"
71+
}
72+
if xml.Unmarshal(data, new(interface{})) == nil {
73+
return "text/xml"
74+
}
75+
return ""
76+
}

libbeat/mime/mime_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// Licensed to Elasticsearch B.V. under one or more contributor
2+
// license agreements. See the NOTICE file distributed with
3+
// this work for additional information regarding copyright
4+
// ownership. Elasticsearch B.V. licenses this file to you under
5+
// the Apache License, Version 2.0 (the "License"); you may
6+
// not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package mime
19+
20+
import (
21+
"encoding/hex"
22+
"testing"
23+
24+
"github.com/stretchr/testify/require"
25+
)
26+
27+
func TestMimeType(t *testing.T) {
28+
tests := []struct {
29+
name string
30+
expectedType string
31+
body string
32+
}{
33+
{
34+
name: "html",
35+
expectedType: "text/html; charset=utf-8",
36+
body: "<html>Test</html>",
37+
},
38+
{
39+
name: "pe",
40+
expectedType: "application/vnd.microsoft.portable-executable",
41+
body: convertToData(t, "4d5a90000300000004000000ffff"),
42+
},
43+
{
44+
name: "elf",
45+
expectedType: "application/x-executable",
46+
body: convertToData(t, "7f454c460101010000000000000000000300030001000000f0dc01003400000080a318000000000034002000080028001e001d0001"),
47+
},
48+
{
49+
name: "macho",
50+
expectedType: "application/x-mach-binary",
51+
body: convertToData(t, "cffaedfe0700000103000000020000001000000058050000850020000000000019000000480000005f5f504147455a45524f"),
52+
},
53+
{
54+
name: "json",
55+
expectedType: "application/json",
56+
body: "{}",
57+
},
58+
{
59+
name: "xml",
60+
expectedType: "text/xml",
61+
body: "<test></test>",
62+
},
63+
{
64+
name: "text",
65+
expectedType: "text/plain; charset=utf-8",
66+
body: "Hello world!",
67+
},
68+
{
69+
name: "png",
70+
expectedType: "image/png",
71+
body: convertToData(t, "89504e470d0a1a0a0000000d494844520000025800000258080200000031040f8b0000000467414d410000b18f0bfc610500"),
72+
},
73+
}
74+
for _, test := range tests {
75+
t.Run(test.name, func(t *testing.T) {
76+
require.Equal(t, test.expectedType, Detect(test.body))
77+
})
78+
}
79+
}
80+
81+
// convertToData decodes a hex-encoded sample into the raw bytes it
// represents, returned as a string. The test fails immediately if sample is
// not valid hex.
func convertToData(t *testing.T, sample string) string {
	t.Helper()

	raw, decodeErr := hex.DecodeString(sample)
	if decodeErr != nil {
		t.Fatal(decodeErr)
	}
	return string(raw)
}

libbeat/mime/string.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Licensed to Elasticsearch B.V. under one or more contributor
2+
// license agreements. See the NOTICE file distributed with
3+
// this work for additional information regarding copyright
4+
// ownership. Elasticsearch B.V. licenses this file to you under
5+
// the Apache License, Version 2.0 (the "License"); you may
6+
// not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package mime
19+
20+
// Detect tries to detect a mime-type based off
21+
// of a byte string passed into the function
22+
func Detect(data string) string {
23+
return DetectBytes([]byte(data))
24+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// Licensed to Elasticsearch B.V. under one or more contributor
2+
// license agreements. See the NOTICE file distributed with
3+
// this work for additional information regarding copyright
4+
// ownership. Elasticsearch B.V. licenses this file to you under
5+
// the Apache License, Version 2.0 (the "License"); you may
6+
// not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package actions
19+
20+
import (
21+
"fmt"
22+
23+
"github.com/pkg/errors"
24+
25+
"github.com/elastic/beats/v7/libbeat/beat"
26+
"github.com/elastic/beats/v7/libbeat/common"
27+
"github.com/elastic/beats/v7/libbeat/mime"
28+
"github.com/elastic/beats/v7/libbeat/processors"
29+
"github.com/elastic/beats/v7/libbeat/processors/checks"
30+
)
31+
32+
func init() {
33+
processors.RegisterPlugin("detect_mime_type",
34+
checks.ConfigChecked(NewDetectMimeType,
35+
checks.RequireFields("field", "target"),
36+
checks.AllowedFields("field", "target")))
37+
}
38+
39+
// mimeTypeProcessor holds the settings of the detect_mime_type processor.
type mimeTypeProcessor struct {
	// Field is the event field whose string value is analyzed.
	Field string `config:"field"`
	// Target is the event field that receives the detected MIME type.
	Target string `config:"target"`
}
43+
44+
// NewDetectMimeType constructs a new mime processor.
45+
func NewDetectMimeType(cfg *common.Config) (processors.Processor, error) {
46+
mimeType := &mimeTypeProcessor{}
47+
if err := cfg.Unpack(mimeType); err != nil {
48+
return nil, errors.Wrapf(err, "fail to unpack the detect_mime_type configuration")
49+
}
50+
51+
return mimeType, nil
52+
}
53+
54+
func (m *mimeTypeProcessor) Run(event *beat.Event) (*beat.Event, error) {
55+
valI, err := event.GetValue(m.Field)
56+
if err != nil {
57+
// doesn't have the required fieldd value to analyze
58+
return event, nil
59+
}
60+
val, _ := valI.(string)
61+
if val == "" {
62+
// wrong type or not set
63+
return event, nil
64+
}
65+
if mimeType := mime.Detect(val); mimeType != "" {
66+
event.Fields.DeepUpdate(common.MapStr{
67+
m.Target: mimeType,
68+
})
69+
}
70+
return event, nil
71+
}
72+
73+
func (m *mimeTypeProcessor) String() string {
74+
return fmt.Sprintf("detect_mime_type=%+v->%+v", m.Field, m.Target)
75+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Licensed to Elasticsearch B.V. under one or more contributor
2+
// license agreements. See the NOTICE file distributed with
3+
// this work for additional information regarding copyright
4+
// ownership. Elasticsearch B.V. licenses this file to you under
5+
// the Apache License, Version 2.0 (the "License"); you may
6+
// not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package actions
19+
20+
import (
21+
"testing"
22+
23+
"github.com/stretchr/testify/require"
24+
25+
"github.com/elastic/beats/v7/libbeat/beat"
26+
"github.com/elastic/beats/v7/libbeat/common"
27+
)
28+
29+
func TestMimeTypeFromTo(t *testing.T) {
30+
evt := beat.Event{
31+
Fields: common.MapStr{
32+
"foo.bar.baz": "hello world!",
33+
},
34+
}
35+
p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{
36+
"field": "foo.bar.baz",
37+
"target": "bar.baz.zoiks",
38+
}))
39+
require.NoError(t, err)
40+
observed, err := p.Run(&evt)
41+
require.NoError(t, err)
42+
enriched, err := observed.Fields.GetValue("bar.baz.zoiks")
43+
require.NoError(t, err)
44+
require.Equal(t, "text/plain; charset=utf-8", enriched)
45+
}
46+
47+
func TestMimeTypeTestNoMatch(t *testing.T) {
48+
evt := beat.Event{
49+
Fields: common.MapStr{
50+
"foo.bar.baz": string([]byte{0, 0}),
51+
},
52+
}
53+
p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{
54+
"field": "foo.bar.baz",
55+
"target": "bar.baz.zoiks",
56+
}))
57+
require.NoError(t, err)
58+
observed, err := p.Run(&evt)
59+
require.NoError(t, err)
60+
hasKey, _ := observed.Fields.HasKey("bar.baz.zoiks")
61+
require.False(t, hasKey)
62+
}

0 commit comments

Comments
 (0)