From c0b5d00e60ed90e0574cebb95824d23e19c808d5 Mon Sep 17 00:00:00 2001
From: Andrew Stucki <andrew.stucki@elastic.co>
Date: Tue, 19 Jan 2021 11:27:34 -0500
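Subject: [PATCH] Update suricata integration with wildcard fields

Switch host.hostname, host.os.name, dns.answers.data, dns.question.name
and log.file.path from the keyword type to wildcard, and update the field
table in docs/README.md to match. Multi-line single-quoted descriptions in
the field definitions are collapsed into double-quoted strings with "\n",
and the manifest strings are requoted the same way. The package version
is bumped from 0.3.4 to 0.4.0.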
---
 .../suricata/data_stream/eve/fields/agent.yml | 40 +++--------
 .../data_stream/eve/fields/fields-epr.yml     | 70 +++++--------------
 packages/suricata/docs/README.md              | 10 +--
 packages/suricata/manifest.yml                |  8 +--
 4 files changed, 37 insertions(+), 91 deletions(-)

diff --git a/packages/suricata/data_stream/eve/fields/agent.yml b/packages/suricata/data_stream/eve/fields/agent.yml
index da4e652c53b..777ab188e3c 100644
--- a/packages/suricata/data_stream/eve/fields/agent.yml
+++ b/packages/suricata/data_stream/eve/fields/agent.yml
@@ -2,16 +2,14 @@
   title: Cloud
   group: 2
   description: Fields related to the cloud or infrastructure the events are coming from.
-  footnote: 'Examples: If Metricbeat is running on an EC2 host and fetches data from its host, the cloud info contains the data about this machine. If Metricbeat runs on a remote machine outside the cloud and fetches data from a service running in the cloud, the field contains cloud data from the machine the service is running on.'
+  footnote: "Examples: If Metricbeat is running on an EC2 host and fetches data from its host, the cloud info contains the data about this machine. If Metricbeat runs on a remote machine outside the cloud and fetches data from a service running in the cloud, the field contains cloud data from the machine the service is running on."
   type: group
   fields:
     - name: account.id
       level: extended
       type: keyword
       ignore_above: 1024
-      description: 'The cloud account or organization id used to identify different entities in a multi-tenant environment.
-
-        Examples: AWS account id, Google Cloud ORG Id, or other unique identifier.'
+      description: "The cloud account or organization id used to identify different entities in a multi-tenant environment.\nExamples: AWS account id, Google Cloud ORG Id, or other unique identifier."
       example: 666777888999
     - name: availability_zone
       level: extended
@@ -57,9 +55,7 @@
 - name: container
   title: Container
   group: 2
-  description: 'Container fields are used for meta information about the specific container that is the source of information.
-
-    These fields help correlate data based containers from any runtime.'
+  description: "Container fields are used for meta information about the specific container that is the source of information.\nThese fields help correlate data based containers from any runtime."
   type: group
   fields:
     - name: id
@@ -85,9 +81,7 @@
 - name: host
   title: Host
   group: 2
-  description: 'A host is defined as a general computing instance.
-
-    ECS host.* fields should be populated with details about the host on which the event happened, or from which the measurement was taken. Host types include hardware, virtual machines, Docker containers, and Kubernetes nodes.'
+  description: "A host is defined as a general computing instance.\nECS host.* fields should be populated with details about the host on which the event happened, or from which the measurement was taken. Host types include hardware, virtual machines, Docker containers, and Kubernetes nodes."
   type: group
   fields:
     - name: architecture
@@ -100,27 +94,19 @@
       level: extended
       type: keyword
       ignore_above: 1024
-      description: 'Name of the domain of which the host is a member.
-
-        For example, on Windows this could be the host''s Active Directory domain or NetBIOS domain name. For Linux this could be the domain of the host''s LDAP provider.'
+      description: "Name of the domain of which the host is a member.\nFor example, on Windows this could be the host's Active Directory domain or NetBIOS domain name. For Linux this could be the domain of the host's LDAP provider."
       example: CONTOSO
       default_field: false
     - name: hostname
       level: core
-      type: keyword
+      type: wildcard
       ignore_above: 1024
-      description: 'Hostname of the host.
-
-        It normally contains what the `hostname` command returns on the host machine.'
+      description: "Hostname of the host.\nIt normally contains what the `hostname` command returns on the host machine."
     - name: id
       level: core
       type: keyword
       ignore_above: 1024
-      description: 'Unique host id.
-
-        As hostname is not always unique, use values that are meaningful in your environment.
-
-        Example: The current usage of `beat.name`.'
+      description: "Unique host id.\nAs hostname is not always unique, use values that are meaningful in your environment.\nExample: The current usage of `beat.name`."
     - name: ip
       level: core
       type: ip
@@ -134,9 +120,7 @@
       level: core
       type: keyword
       ignore_above: 1024
-      description: 'Name of the host.
-
-        It can contain what `hostname` returns on Unix systems, the fully qualified domain name, or a name specified by the user. The sender decides which value to use.'
+      description: "Name of the host.\nIt can contain what `hostname` returns on Unix systems, the fully qualified domain name, or a name specified by the user. The sender decides which value to use."
     - name: os.family
       level: extended
       type: keyword
@@ -151,7 +135,7 @@
       example: 4.4.0-112-generic
     - name: os.name
       level: extended
-      type: keyword
+      type: wildcard
       ignore_above: 1024
       multi_fields:
         - name: text
@@ -176,9 +160,7 @@
       level: core
       type: keyword
       ignore_above: 1024
-      description: 'Type of host.
-
-        For Cloud providers this can be the machine type like `t2.medium`. If vm, this could be the container, for example, or other information meaningful in your environment.'
+      description: "Type of host.\nFor Cloud providers this can be the machine type like `t2.medium`. If vm, this could be the container, for example, or other information meaningful in your environment."
     - name: containerized
       type: boolean
       description: >
diff --git a/packages/suricata/data_stream/eve/fields/fields-epr.yml b/packages/suricata/data_stream/eve/fields/fields-epr.yml
index 0c3265df7d3..373e6a926aa 100644
--- a/packages/suricata/data_stream/eve/fields/fields-epr.yml
+++ b/packages/suricata/data_stream/eve/fields/fields-epr.yml
@@ -1,56 +1,36 @@
 - name: event
   title: Event
   group: 2
-  description: 'The event fields are used for context information about the log or metric event itself.
-
-    A log is defined as an event containing details of something that happened. Log events must include the time at which the thing happened. Examples of log events include a process starting on a host, a network packet being sent from a source to a destination, or a network connection between a client and a server being initiated or closed. A metric is defined as an event containing one or more numerical measurements and the time at which the measurement was taken. Examples of metric events include memory pressure measured on a host and device temperature. See the `event.kind` definition in this section for additional details about metric and state events.'
+  description: "The event fields are used for context information about the log or metric event itself.\nA log is defined as an event containing details of something that happened. Log events must include the time at which the thing happened. Examples of log events include a process starting on a host, a network packet being sent from a source to a destination, or a network connection between a client and a server being initiated or closed. A metric is defined as an event containing one or more numerical measurements and the time at which the measurement was taken. Examples of metric events include memory pressure measured on a host and device temperature. See the `event.kind` definition in this section for additional details about metric and state events."
   type: group
   fields:
     - name: created
       level: core
       type: date
-      description: 'event.created contains the date/time when the event was first read by an agent, or by your pipeline.
-
-        This field is distinct from @timestamp in that @timestamp typically contain the time extracted from the original event.
-
-        In most situations, these two timestamps will be slightly different. The difference can be used to calculate the delay between your source generating an event, and the time when your agent first processed it. This can be used to monitor your agent''s or pipeline''s ability to keep up with your event source.
-
-        In case the two timestamps are identical, @timestamp should be used.'
-      example: '2016-05-23T08:05:34.857Z'
+      description: "event.created contains the date/time when the event was first read by an agent, or by your pipeline.\nThis field is distinct from @timestamp in that @timestamp typically contain the time extracted from the original event.\nIn most situations, these two timestamps will be slightly different. The difference can be used to calculate the delay between your source generating an event, and the time when your agent first processed it. This can be used to monitor your agent's or pipeline's ability to keep up with your event source.\nIn case the two timestamps are identical, @timestamp should be used."
+      example: "2016-05-23T08:05:34.857Z"
     - name: ingested
       level: core
       type: date
-      description: 'Timestamp when an event arrived in the central data store.
-
-        This is different from `@timestamp`, which is when the event originally occurred.  It''s also different from `event.created`, which is meant to capture the first time an agent saw the event.
-
-        In normal conditions, assuming no tampering, the timestamps should chronologically look like this: `@timestamp` < `event.created` < `event.ingested`.'
-      example: '2016-05-23T08:05:35.101Z'
+      description: "Timestamp when an event arrived in the central data store.\nThis is different from `@timestamp`, which is when the event originally occurred.  It's also different from `event.created`, which is meant to capture the first time an agent saw the event.\nIn normal conditions, assuming no tampering, the timestamps should chronologically look like this: `@timestamp` < `event.created` < `event.ingested`."
+      example: "2016-05-23T08:05:35.101Z"
     - name: original
       level: core
       type: keyword
       ignore_above: 1024
-      description: 'Raw text message of entire event. Used to demonstrate log integrity.
-
-        This field is not indexed and doc_values are disabled. It cannot be searched, but it can be retrieved from `_source`.'
+      description: "Raw text message of entire event. Used to demonstrate log integrity.\nThis field is not indexed and doc_values are disabled. It cannot be searched, but it can be retrieved from `_source`."
       example: Sep 19 08:26:10 host CEF:0&#124;Security&#124; threatmanager&#124;1.0&#124;100&#124; worm successfully stopped&#124;10&#124;src=10.0.0.1 dst=2.1.2.2spt=1232
 - name: dns
   title: DNS
   group: 2
-  description: 'Fields describing DNS queries and answers.
-
-    DNS events should either represent a single DNS query prior to getting answers (`dns.type:query`) or they should represent a full exchange and contain the query details as well as all of the answers that were provided for this query (`dns.type:answer`).'
+  description: "Fields describing DNS queries and answers.\nDNS events should either represent a single DNS query prior to getting answers (`dns.type:query`) or they should represent a full exchange and contain the query details as well as all of the answers that were provided for this query (`dns.type:answer`)."
   type: group
   fields:
     - name: answers
       level: extended
       type: object
       object_type: keyword
-      description: 'An array containing an object for each answer section returned by the server.
-
-        The main keys that should be present in these objects are defined by ECS. Records that have more information may contain more keys than what ECS defines.
-
-        Not all DNS data sources give all details about DNS answers. At minimum, answer objects must contain the `data` key. If more information is available, map as much of it to ECS as possible, and add any additional fields to the answer objects as custom fields.'
+      description: "An array containing an object for each answer section returned by the server.\nThe main keys that should be present in these objects are defined by ECS. Records that have more information may contain more keys than what ECS defines.\nNot all DNS data sources give all details about DNS answers. At minimum, answer objects must contain the `data` key. If more information is available, map as much of it to ECS as possible, and add any additional fields to the answer objects as custom fields."
     - name: answers.class
       level: extended
       type: keyword
@@ -59,19 +39,15 @@
       example: IN
     - name: answers.data
       level: extended
-      type: keyword
+      type: wildcard
       ignore_above: 1024
-      description: 'The data describing the resource.
-
-        The meaning of this data depends on the type and class of the resource record.'
+      description: "The data describing the resource.\nThe meaning of this data depends on the type and class of the resource record."
       example: 10.10.10.10
     - name: answers.name
       level: extended
       type: keyword
       ignore_above: 1024
-      description: 'The domain name to which this resource record pertains.
-
-        If a chain of CNAME is being resolved, each answer''s `name` should be the one that corresponds with the answer''s `data`. It should not simply be the original `question.name` repeated.'
+      description: "The domain name to which this resource record pertains.\nIf a chain of CNAME is being resolved, each answer's `name` should be the one that corresponds with the answer's `data`. It should not simply be the original `question.name` repeated."
       example: www.google.com
     - name: answers.ttl
       level: extended
@@ -88,9 +64,7 @@
       level: extended
       type: keyword
       ignore_above: 1024
-      description: 'Array of 2 letter DNS header flags.
-
-        Expected values are: AA, TC, RD, RA, AD, CD, DO.'
+      description: "Array of 2 letter DNS header flags.\nExpected values are: AA, TC, RD, RA, AD, CD, DO."
       example:
         - RD
         - RA
@@ -114,7 +88,7 @@
       example: IN
     - name: question.name
       level: extended
-      type: keyword
+      type: wildcard
       ignore_above: 1024
       description: 'The name being queried.
 
@@ -155,9 +129,7 @@
     - name: resolved_ip
       level: extended
       type: ip
-      description: 'Array containing all IPs seen in `answers.data`.
-
-        The `answers` array can be difficult to use, because of the variety of data formats it can contain. Extracting all IP addresses seen in there to `dns.resolved_ip` makes it possible to index them as IP addresses, and makes them easier to visualize and query for.'
+      description: "Array containing all IPs seen in `answers.data`.\nThe `answers` array can be difficult to use, because of the variety of data formats it can contain. Extracting all IP addresses seen in there to `dns.resolved_ip` makes it possible to index them as IP addresses, and makes them easier to visualize and query for."
       example:
         - 10.10.10.10
         - 10.10.10.11
@@ -171,20 +143,12 @@
       level: extended
       type: keyword
       ignore_above: 1024
-      description: 'The type of DNS event captured, query or answer.
-
-        If your source of DNS events only gives you DNS queries, you should only create dns events of type `dns.type:query`.
-
-        If your source of DNS events gives you answers as well, you should create one event per query (optionally as soon as the query is seen). And a second event containing all query details as well as an array of answers.'
+      description: "The type of DNS event captured, query or answer.\nIf your source of DNS events only gives you DNS queries, you should only create dns events of type `dns.type:query`.\nIf your source of DNS events gives you answers as well, you should create one event per query (optionally as soon as the query is seen). And a second event containing all query details as well as an array of answers."
       example: answer
 - name: related
   title: Related
   group: 2
-  description: 'This field set is meant to facilitate pivoting around a piece of data.
-
-    Some pieces of information can be seen in many places in an ECS event. To facilitate searching for them, store an array of all seen values to their corresponding field in `related.`.
-
-    A concrete example is IP addresses, which can be under host, observer, source, destination, client, server, and network.forwarded_ip. If you append all IPs to `related.ip`, you can then search for a given IP trivially, no matter where it appeared, by querying `related.ip:192.0.2.15`.'
+  description: "This field set is meant to facilitate pivoting around a piece of data.\nSome pieces of information can be seen in many places in an ECS event. To facilitate searching for them, store an array of all seen values to their corresponding field in `related.`.\nA concrete example is IP addresses, which can be under host, observer, source, destination, client, server, and network.forwarded_ip. If you append all IPs to `related.ip`, you can then search for a given IP trivially, no matter where it appeared, by querying `related.ip:192.0.2.15`."
   type: group
   fields:
     - name: ip
@@ -195,7 +159,7 @@
   type: keyword
   description: Filebeat input type used to collect the log.
 - name: log.file.path
-  type: keyword
+  type: wildcard
   description: >
     The file from which the line was read. This field contains the absolute path to the file. For example: `/var/log/system.log`.
 
diff --git a/packages/suricata/docs/README.md b/packages/suricata/docs/README.md
index adf00f6bc35..87b1cdb8258 100644
--- a/packages/suricata/docs/README.md
+++ b/packages/suricata/docs/README.md
@@ -49,7 +49,7 @@ with other versions of Suricata.
 | destination.port | Port of the destination. | long |
 | dns.answers | An array containing an object for each answer section returned by the server. The main keys that should be present in these objects are defined by ECS. Records that have more information may contain more keys than what ECS defines. Not all DNS data sources give all details about DNS answers. At minimum, answer objects must contain the `data` key. If more information is available, map as much of it to ECS as possible, and add any additional fields to the answer objects as custom fields. | object |
 | dns.answers.class | The class of DNS data contained in this resource record. | keyword |
-| dns.answers.data | The data describing the resource. The meaning of this data depends on the type and class of the resource record. | keyword |
+| dns.answers.data | The data describing the resource. The meaning of this data depends on the type and class of the resource record. | wildcard |
 | dns.answers.name | The domain name to which this resource record pertains. If a chain of CNAME is being resolved, each answer's `name` should be the one that corresponds with the answer's `data`. It should not simply be the original `question.name` repeated. | keyword |
 | dns.answers.ttl | The time interval in seconds that this resource record may be cached before it should be discarded. Zero values mean that the data should not be cached. | long |
 | dns.answers.type | The type of data contained in this resource record. | keyword |
@@ -57,7 +57,7 @@ with other versions of Suricata.
 | dns.id | The DNS packet identifier assigned by the program that generated the query. The identifier is copied to the response. | keyword |
 | dns.op_code | The DNS operation code that specifies the kind of query in the message. This value is set by the originator of a query and copied into the response. | keyword |
 | dns.question.class | The class of records being queried. | keyword |
-| dns.question.name | The name being queried. If the name field contains non-printable characters (below 32 or above 126), those characters should be represented as escaped base 10 integers (\DDD). Back slashes and quotes should be escaped. Tabs, carriage returns, and line feeds should be converted to \t, \r, and \n respectively. | keyword |
+| dns.question.name | The name being queried. If the name field contains non-printable characters (below 32 or above 126), those characters should be represented as escaped base 10 integers (\DDD). Back slashes and quotes should be escaped. Tabs, carriage returns, and line feeds should be converted to \t, \r, and \n respectively. | wildcard |
 | dns.question.registered_domain | The highest registered domain, stripped of the subdomain. For example, the registered domain for "foo.google.com" is "google.com". This value can be determined precisely with a list like the public suffix list (http://publicsuffix.org). Trying to approximate this by simply taking the last two labels will not work well for TLDs such as "co.uk". | keyword |
 | dns.question.subdomain | The subdomain is all of the labels under the registered_domain. If the domain has multiple levels of subdomain, such as "sub2.sub1.example.com", the subdomain field should contain "sub2.sub1", with no trailing period. | keyword |
 | dns.question.top_level_domain | The effective top level domain (eTLD), also known as the domain suffix, is the last part of the domain name. For example, the top level domain for google.com is "com". This value can be determined precisely with a list like the public suffix list (http://publicsuffix.org). Trying to approximate this by simply taking the last label will not work well for effective TLDs such as "co.uk". | keyword |
@@ -79,7 +79,7 @@ with other versions of Suricata.
 | host.architecture | Operating system architecture. | keyword |
 | host.containerized | If the host is a container. | boolean |
 | host.domain | Name of the domain of which the host is a member. For example, on Windows this could be the host's Active Directory domain or NetBIOS domain name. For Linux this could be the domain of the host's LDAP provider. | keyword |
-| host.hostname | Hostname of the host. It normally contains what the `hostname` command returns on the host machine. | keyword |
+| host.hostname | Hostname of the host. It normally contains what the `hostname` command returns on the host machine. | wildcard |
 | host.id | Unique host id. As hostname is not always unique, use values that are meaningful in your environment. Example: The current usage of `beat.name`. | keyword |
 | host.ip | Host ip addresses. | ip |
 | host.mac | Host mac addresses. | keyword |
@@ -88,7 +88,7 @@ with other versions of Suricata.
 | host.os.codename | OS codename, if any. | keyword |
 | host.os.family | OS family (such as redhat, debian, freebsd, windows). | keyword |
 | host.os.kernel | Operating system kernel version as a raw string. | keyword |
-| host.os.name | Operating system name, without the version. | keyword |
+| host.os.name | Operating system name, without the version. | wildcard |
 | host.os.platform | Operating system platform (such centos, ubuntu, windows). | keyword |
 | host.os.version | Operating system version as a raw string. | keyword |
 | host.type | Type of host. For Cloud providers this can be the machine type like `t2.medium`. If vm, this could be the container, for example, or other information meaningful in your environment. | keyword |
@@ -97,7 +97,7 @@ with other versions of Suricata.
 | http.response.body.bytes | Size in bytes of the response body. | long |
 | http.response.status_code | HTTP response status code. | long |
 | input.type | Filebeat input type used to collect the log. | keyword |
-| log.file.path | The file from which the line was read. This field contains the absolute path to the file. For example: `/var/log/system.log`. | keyword |
+| log.file.path | The file from which the line was read. This field contains the absolute path to the file. For example: `/var/log/system.log`. | wildcard |
 | log.offset | The file offset the reported line starts at. | long |
 | message | Log message optimized for viewing in a log viewer. | text |
 | network.bytes | Total bytes transferred in both directions. | long |
diff --git a/packages/suricata/manifest.yml b/packages/suricata/manifest.yml
index 53240eb08fc..1c4a1cede69 100644
--- a/packages/suricata/manifest.yml
+++ b/packages/suricata/manifest.yml
@@ -1,6 +1,6 @@
 name: suricata
 title: Suricata
-version: 0.3.4
+version: 0.4.0
 release: experimental
 description: Suricata Integration
 type: integration
@@ -13,7 +13,7 @@ format_version: 1.0.0
 license: basic
 categories: [network, security]
 conditions:
-  kibana.version: '>=7.10.0'
+  kibana.version: ">=7.10.0"
 screenshots:
   - src: /img/filebeat-suricata-events.png
     title: filebeat suricata events
@@ -38,7 +38,7 @@ policy_templates:
             show_user: true
             default:
               - suricata
-        title: 'Collect Suricata eve logs (input: logfile)'
-        description: 'Collecting eve logs from Suricata instances (input: logfile)'
+        title: "Collect Suricata eve logs (input: logfile)"
+        description: "Collecting eve logs from Suricata instances (input: logfile)"
 owner:
   github: elastic/security-external-integrations