[AWS][Cloudfront Logs] - Implemented GROK processor based ipv6/v4 parsing in AWS Cloudfront Logs data stream (elastic#11829)

* The regex pattern used for the ipv6 was overly complex and caused errors in Elasticsearch. This change simplifies and fixes the grok-based IPv6/IPv4 parsing by adding a new helper pipeline invoked iteratively over ip fields via a foreach. Also add a painless script to handle 'localhost' literal addresses.
* Also run elastic-package format.
harnish-elastic · Nov 25, 2024 · 29e226c · 29e226c
1 parent 60f7b87
commit 29e226c
Show file tree

Hide file tree

Showing 43 changed files with 192 additions and 240 deletions.
diff --git a/packages/aws/changelog.yml b/packages/aws/changelog.yml
@@ -1,4 +1,12 @@
 # newer versions go on top
+- version: "2.32.0"
+  changes:
+    - description: Implemented grok processor based parsing for ipv6 & ipv4 addresses in the AWS CloudFront logs.
+      type: bugfix
+      link: https://github.com/elastic/integrations/pull/11829
+    - description: Auto formatted various text descriptions and newlines across all data streams via elastic-package.
+      type: enhancement
+      link: https://github.com/elastic/integrations/pull/11829
 - version: "2.31.4"
   changes:
     - description: Update documentation with required permissions for AWS Inspector.

diff --git a/packages/aws/data_stream/apigateway_logs/manifest.yml b/packages/aws/data_stream/apigateway_logs/manifest.yml
@@ -86,12 +86,8 @@ streams:
         type: integer
         title: "[SQS] Maximum Concurrent SQS Messages"
         description: >
-          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large
-          amount of large size S3 objects and each object has large amount of events, if this parameter sets too high,
-          it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure.
-          We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings)
-          option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can
-          choose a balanced (the default) set of performance specifications.
+          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large amount of large size S3 objects and each object has large amount of events, if this parameter sets too high, it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure. We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings) option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can choose a balanced (the default) set of performance specifications.
+
         default: 5
         required: false
         show_user: false

diff --git a/packages/aws/data_stream/apigateway_metrics/fields/package-fields.yml b/packages/aws/data_stream/apigateway_metrics/fields/package-fields.yml
@@ -7,7 +7,5 @@
         Tag key-value pairs from AWS resources.
     - name: metrics_names_fingerprint
       type: keyword
-      description: |
-        Autogenerated ID representing the fingerprint of the list of metrics names. 
-        Applicable only for [Amazon Data Firehose integration](https://www.elastic.co/docs/current/integrations/awsfirehose).
+      description: "Autogenerated ID representing the fingerprint of the list of metrics names. \nApplicable only for [Amazon Data Firehose integration](https://www.elastic.co/docs/current/integrations/awsfirehose).\n"
       dimension: true
diff --git a/packages/aws/data_stream/cloudfront_logs/_dev/test/pipeline/test-cloudfront.log-expected.json b/packages/aws/data_stream/cloudfront_logs/_dev/test/pipeline/test-cloudfront.log-expected.json
@@ -1336,4 +1336,4 @@
             }
         }
     ]
-}
+}
diff --git a/packages/aws/data_stream/cloudfront_logs/elasticsearch/ingest_pipeline/default.yml b/packages/aws/data_stream/cloudfront_logs/elasticsearch/ingest_pipeline/default.yml
@@ -209,48 +209,48 @@ processors:
       source: >-
         ctx.event.duration = (Long)(Float.parseFloat(ctx._tmp.time_taken) * params.S_TO_NS);
   # x-forwarded-for
+  - split:
+      field: _tmp.x_forwarded_for
+      separator: ","
+      target_field: _tmp.split_x_forwarded_for
+      if: ctx._tmp?.x_forwarded_for != null && ctx._tmp.x_forwarded_for != '-'
   - script:
       lang: painless
-      description: Parse x-forwarded-for ip-addresses. Add invalid values to an error string without failing.
-      if: ctx._tmp?.x_forwarded_for != null && ctx._tmp.x_forwarded_for != '-'
+      description: trim leading and trailing whitespace from the split IPs 
+      if: ctx._tmp?.split_x_forwarded_for != null
       source: |
-        ArrayList ips = new ArrayList();
-        ArrayList bad_ips = new ArrayList();
-        def raw_ips = ctx._tmp?.x_forwarded_for.splitOnToken(',');
-
-        // Patterns used for ip validation, inspired by logstash. Used as a workaround due to lack of a simpler method.
-        Pattern ipv4 = /^(?<![0-9])(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9])$/;
-        Pattern ipv6 = /^((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)??$/;
-
-        // Sanitize all ip strings
-        for (String raw_item : raw_ips ) {
-          String item = raw_item.trim();
-
-          if (ipv4.matcher(item).matches() || ipv6.matcher(item).matches()) {
-            ips.add(item);
-          }
-          // Edge case observed in the wild. E.g. 'localhost:8081'
-          else if (item.startsWith('localhost')) {
-            ips.add('127.0.0.1')
-          }
-          else {
-            bad_ips.add(item);
-          }
-        }
-
-        // Workaround due to lack of support of dotted fields
-        if (ctx.get('network') == null) {
-          ctx['network'] = new HashMap();
-        }
-        ctx['network']['forwarded_ip'] = ips;
-
-        if (bad_ips.length > 0 ) {
-          ctx['_tmp']['error_message'] = "Invalid ip addresses: " + bad_ips.join(',');
+        for (int i = 0; i < ctx._tmp.split_x_forwarded_for.length; i++) {
+          ctx._tmp.split_x_forwarded_for[i] = ctx._tmp.split_x_forwarded_for[i].trim();
         }
+  - foreach:
+      field: _tmp.split_x_forwarded_for
+      if: ctx._tmp.split_x_forwarded_for instanceof List
+      processor:
+        pipeline:
+          name: '{{ IngestPipeline "pipeline_process_ip" }}'
+          tag: pipeline_process_ip
+          ignore_missing_pipeline: true
   - append:
       field: error.message
-      value: "{{{ _tmp.error_message }}}"
-      if: ctx._tmp?.error_message != null
+      value: "Invalid IP addresses: {{_tmp.invalid_ips}}"
+      if: ctx._tmp.invalid_ips != null
+  - script:
+      lang: painless
+      description: Handle 'localhost' edge case, currently not handled via grok
+      if: ctx._tmp?.split_x_forwarded_for != null && ctx._tmp.split_x_forwarded_for != '-'
+      source: |
+          if (ctx.get('network') == null) {
+              ctx['network'] = new HashMap();
+            }
+          for (String item : ctx._tmp.split_x_forwarded_for ) {
+            // edge case observed in the wild. e.g. 'localhost:8081'
+            if (item.startsWith('localhost')) {
+            if (ctx.network.forwarded_ip == null) {
+              ctx['network']['forwarded_ip'] = new ArrayList();
+            }
+              ctx['network']['forwarded_ip'].add('127.0.0.1');
+            }
+          }
   - foreach:
       field: network.forwarded_ip
       processor:

diff --git a/packages/aws/data_stream/cloudfront_logs/elasticsearch/ingest_pipeline/pipeline_process_ip.yml b/packages/aws/data_stream/cloudfront_logs/elasticsearch/ingest_pipeline/pipeline_process_ip.yml
@@ -0,0 +1,33 @@
+---
+description: Pipeline for triggering the grok processor and helper scripts to effectively parse and append individual ipv6/v4 addresses.
+
+processors:
+  - grok:
+      field: _ingest._value
+      tag: grok_parse_x_forwarded_for_ipv4_and_ipv6_addresses
+      ignore_missing: true
+      ignore_failure: true
+      patterns:
+        - "^%{IPV4:_tmp.valid_ip}$"
+        - "^%{IPV6:_tmp.valid_ip}$"
+        - "^%{IPV6NOCOMPRESS:_tmp.valid_ip}$"
+        - "^\\[%{IPV6:_tmp.valid_ip}\\]$"
+      pattern_definitions:
+        IPV6NOCOMPRESS: '([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}'
+      on_failure:
+        - set:
+            field: _tmp.invalid_ip
+            value: "{{_ingest._value}}"
+        - append:
+            field: error.message
+            value: 'Processor {{{_ingest.on_failure_processor_type}}} with tag {{{_ingest.on_failure_processor_tag}}} in pipeline {{{_ingest.on_failure_pipeline}}} failed with message: {{{_ingest.on_failure_message}}}'
+  - append:
+      field: network.forwarded_ip
+      value: "{{_tmp.valid_ip}}"
+      if: ctx._tmp.valid_ip != null && ctx._tmp.valid_ip != ""
+      allow_duplicates: false
+  - append:
+      field: _tmp.invalid_ips
+      value: "{{_tmp.invalid_ip}}"
+      if: ctx._tmp.invalid_ip != null && ctx._tmp.invalid_ip != ""
+      allow_duplicates: false
diff --git a/packages/aws/data_stream/cloudfront_logs/manifest.yml b/packages/aws/data_stream/cloudfront_logs/manifest.yml
@@ -124,12 +124,8 @@ streams:
         type: integer
         title: "[SQS] Maximum Concurrent SQS Messages"
         description: >
-          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large
-          amount of large size S3 objects and each object has large amount of events, if this parameter sets too high,
-          it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure.
-          We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings)
-          option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can
-          choose a balanced (the default) set of performance specifications.
+          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large amount of large size S3 objects and each object has large amount of events, if this parameter sets too high, it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure. We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings) option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can choose a balanced (the default) set of performance specifications.
+
         default: 5
         required: false
         show_user: false

diff --git a/packages/aws/data_stream/cloudfront_logs/sample_event.json b/packages/aws/data_stream/cloudfront_logs/sample_event.json
@@ -1,11 +1,11 @@
 {
     "@timestamp": "2019-12-04T21:02:31.000Z",
     "agent": {
-        "ephemeral_id": "2e56d54c-2c59-4b67-8f1a-41cf7dbd9d08",
-        "id": "acba78ef-1401-4689-977c-d8c2e5d6a8fa",
-        "name": "docker-fleet-agent",
+        "ephemeral_id": "ea8d1f6d-f155-460c-9a81-17e4b2e25281",
+        "id": "4303444e-1f0c-42c6-981a-73737910b81c",
+        "name": "elastic-agent-18453",
         "type": "filebeat",
-        "version": "8.10.1"
+        "version": "8.16.0"
     },
     "aws": {
         "cloudfront": {
@@ -18,8 +18,8 @@
         },
         "s3": {
             "bucket": {
-                "arn": "arn:aws:s3:::elastic-package-aws-bucket-58094",
-                "name": "elastic-package-aws-bucket-58094"
+                "arn": "arn:aws:s3:::elastic-package-aws-bucket-20687",
+                "name": "elastic-package-aws-bucket-20687"
             },
             "object": {
                 "key": "cloudfront"
@@ -32,7 +32,7 @@
     },
     "data_stream": {
         "dataset": "aws.cloudfront_logs",
-        "namespace": "ep",
+        "namespace": "66517",
         "type": "logs"
     },
     "destination": {
@@ -43,18 +43,19 @@
         "version": "8.11.0"
     },
     "elastic_agent": {
-        "id": "acba78ef-1401-4689-977c-d8c2e5d6a8fa",
+        "id": "4303444e-1f0c-42c6-981a-73737910b81c",
         "snapshot": false,
-        "version": "8.10.1"
+        "version": "8.16.0"
     },
     "event": {
         "agent_id_status": "verified",
         "category": [
             "web"
         ],
         "dataset": "aws.cloudfront_logs",
+        "duration": 1000000,
         "id": "SOX4xwn4XV6Q4rgb7XiVGOHms_BGlTAC4KyHmureZmBNrjGdRLiNIQ==",
-        "ingested": "2023-11-03T13:01:05Z",
+        "ingested": "2024-11-23T12:54:04Z",
         "kind": "event",
         "original": "2019-12-04\t21:02:31\tLAX1\t392\t89.160.20.112\tGET\td111111abcdef8.cloudfront.net\t/index.html\t200\t-\tMozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/78.0.3904.108%20Safari/537.36\t-\t-\tHit\tSOX4xwn4XV6Q4rgb7XiVGOHms_BGlTAC4KyHmureZmBNrjGdRLiNIQ==\td111111abcdef8.cloudfront.net\thttps\t23\t0.001\t-\tTLSv1.2\tECDHE-RSA-AES128-GCM-SHA256\tHit\tHTTP/2.0\t-\t-\t11040\t0.001\tHit\ttext/html\t78\t-\t-",
         "outcome": "success",
@@ -83,7 +84,7 @@
     },
     "log": {
         "file": {
-            "path": "https://elastic-package-aws-bucket-58094.s3.us-east-1.amazonaws.com/cloudfront"
+            "path": "https://elastic-package-aws-bucket-20687.s3.us-east-1.amazonaws.com/cloudfront"
         },
         "offset": 471
     },
@@ -155,4 +156,4 @@
         },
         "version": "78.0.3904.108"
     }
-}
+}
diff --git a/packages/aws/data_stream/cloudtrail/fields/fields.yml b/packages/aws/data_stream/cloudtrail/fields/fields.yml
@@ -188,9 +188,5 @@
           description: >-
             Additional insight details.
 - name: related.entity
-  description: |
-   A collection of all entity identifiers associated with the document. 
-   If the document  contains multiple entities, identifiers for each will be included.
-   Example identifiers include(but not limited to) cloud resource IDs, ARNs,  email addresses,
-   and hostnames. 
-  type: keyword
+  description: "A collection of all entity identifiers associated with the document. \nIf the document  contains multiple entities, identifiers for each will be included.\nExample identifiers include(but not limited to) cloud resource IDs, ARNs,  email addresses,\nand hostnames. \n"
+  type: keyword
diff --git a/packages/aws/data_stream/cloudtrail/manifest.yml b/packages/aws/data_stream/cloudtrail/manifest.yml
@@ -145,12 +145,8 @@ streams:
         type: integer
         title: "[SQS] Maximum Concurrent SQS Messages"
         description: >
-          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large
-          amount of large size S3 objects and each object has large amount of events, if this parameter sets too high,
-          it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure.
-          We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings)
-          option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can
-          choose a balanced (the default) set of performance specifications.
+          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large amount of large size S3 objects and each object has large amount of events, if this parameter sets too high, it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure. We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings) option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can choose a balanced (the default) set of performance specifications.
+
         default: 5
         required: false
       - name: custom

diff --git a/packages/aws/data_stream/dynamodb/fields/package-fields.yml b/packages/aws/data_stream/dynamodb/fields/package-fields.yml
@@ -7,7 +7,5 @@
         Tag key value pairs from aws resources.
     - name: metrics_names_fingerprint
       type: keyword
-      description: |
-        Autogenerated ID representing the fingerprint of the list of metrics names. 
-        Applicable only for [Amazon Data Firehose integration](https://www.elastic.co/docs/current/integrations/awsfirehose).
+      description: "Autogenerated ID representing the fingerprint of the list of metrics names. \nApplicable only for [Amazon Data Firehose integration](https://www.elastic.co/docs/current/integrations/awsfirehose).\n"
       dimension: true
diff --git a/packages/aws/data_stream/ebs/fields/package-fields.yml b/packages/aws/data_stream/ebs/fields/package-fields.yml
@@ -11,7 +11,5 @@
         Name of a S3 bucket.
     - name: metrics_names_fingerprint
       type: keyword
-      description: |
-        Autogenerated ID representing the fingerprint of the list of metrics names. 
-        Applicable only for [Amazon Data Firehose integration](https://www.elastic.co/docs/current/integrations/awsfirehose).
+      description: "Autogenerated ID representing the fingerprint of the list of metrics names. \nApplicable only for [Amazon Data Firehose integration](https://www.elastic.co/docs/current/integrations/awsfirehose).\n"
       dimension: true
diff --git a/packages/aws/data_stream/ec2_logs/manifest.yml b/packages/aws/data_stream/ec2_logs/manifest.yml
@@ -120,12 +120,8 @@ streams:
         type: integer
         title: "[SQS] Maximum Concurrent SQS Messages"
         description: >
-          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large
-          amount of large size S3 objects and each object has large amount of events, if this parameter sets too high,
-          it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure.
-          We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings)
-          option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can
-          choose a balanced (the default) set of performance specifications.
+          The maximum number of SQS messages that can be inflight at any time. Defaults to 5. When processing large amount of large size S3 objects and each object has large amount of events, if this parameter sets too high, it can cause the input to process too many messages concurrently, overload the agent and cause ingest failure. We recommend to keep the default value 5 and use the [preset](https://www.elastic.co/guide/en/fleet/current/es-output-settings.html#es-output-settings-performance-tuning-settings) option to tune your Elastic Agent performance. You can optimize for throughput, scale, latency, or you can choose a balanced (the default) set of performance specifications.
+
         default: 5
         required: false
         show_user: false

diff --git a/packages/aws/data_stream/ec2_metrics/_dev/test/pipeline/test-ec2-metrics-from-firehose.json b/packages/aws/data_stream/ec2_metrics/_dev/test/pipeline/test-ec2-metrics-from-firehose.json
@@ -1,21 +1,21 @@
 {
-  "events": [
-    {
-      "@timestamp": "2024-08-06T22:25:00.000Z",
-      "agent.type": "firehose",
-      "aws.cloudwatch.namespace": "AWS/EC2",
-      "aws.ec2.metrics.CPUUtilization.avg": 21.96,
-      "cloud.account.id": "123456789012",
-      "cloud.instance.id": "i-08b34b681949f9fab",
-      "cloud.provider": "aws",
-      "cloud.region": "us-east-1",
-      "data_stream.dataset": "aws.ec2_metrics",
-      "data_stream.namespace": "default",
-      "data_stream.type": "metrics",
-      "ecs.version": "8.11.0",
-      "event.agent_id_status": "missing",
-      "event.ingested": "2024-08-06T22:26:25Z",
-      "event.module": "aws"
-    }
-  ]
+    "events": [
+        {
+            "@timestamp": "2024-08-06T22:25:00.000Z",
+            "agent.type": "firehose",
+            "aws.cloudwatch.namespace": "AWS/EC2",
+            "aws.ec2.metrics.CPUUtilization.avg": 21.96,
+            "cloud.account.id": "123456789012",
+            "cloud.instance.id": "i-08b34b681949f9fab",
+            "cloud.provider": "aws",
+            "cloud.region": "us-east-1",
+            "data_stream.dataset": "aws.ec2_metrics",
+            "data_stream.namespace": "default",
+            "data_stream.type": "metrics",
+            "ecs.version": "8.11.0",
+            "event.agent_id_status": "missing",
+            "event.ingested": "2024-08-06T22:26:25Z",
+            "event.module": "aws"
+        }
+    ]
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -1336,4 +1336,4 @@ @@
                 }
             }
         ]
-    }
+    }