Skip to content

Commit d26c048

Browse files
committed
Added abnormal and normal pipeline termination scenarios. The YAML config validation logic now covers the dynamic (pipeline-id) keyword.
1 parent 53fa5b8 commit d26c048

10 files changed

Lines changed: 199 additions & 97 deletions

File tree

.buildkite/scripts/health-report-tests/bootstrap.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,30 +70,32 @@ def build_logstash(self):
7070
print(f"Logstash has successfully built.")
7171

7272
def apply_config(self, config: dict) -> None:
    """Write the given pipelines configuration to the test config directory.

    Serializes `config` as YAML into
    `.buildkite/scripts/health-report-tests/config/pipelines.yml`, resolved
    relative to the current working directory (assumed to be the Logstash
    checkout root — TODO confirm against the CI invocation).
    """
    # os.path.join instead of raw string concatenation for portability.
    pipelines_yml = os.path.join(os.getcwd(), ".buildkite", "scripts",
                                 "health-report-tests", "config", "pipelines.yml")
    with open(pipelines_yml, "w") as pipelines_file:
        yaml.dump(config, pipelines_file)
7575

76-
def run_logstash(self) -> subprocess.Popen:
77-
process = subprocess.Popen(["bin/logstash"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
76+
def run_logstash(self, full_start_required: bool) -> subprocess.Popen:
    """Start Logstash and block until it reaches the requested startup stage.

    :param full_start_required: when True, wait for "Pipeline started" (fully
        running); when False, "Starting pipeline" is enough — useful for the
        slow/crashing pipeline scenarios that never fully start.
    :return: the running process, or None if Logstash failed to spin up.
    """
    # --config.reload.automatic keeps the instance active, which is helpful
    # when testing crash-pipeline cases.
    config_path = os.path.join(os.getcwd(), ".buildkite", "scripts",
                               "health-report-tests", "config")
    # NOTE(review): "-w 1" is a single argv token (option and value with an
    # embedded space) rather than ["-w", "1"] — confirm Logstash's option
    # parser accepts this form.
    process = subprocess.Popen(["bin/logstash", "--config.reload.automatic", "--path.settings", config_path,
                                "-w 1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
    if process.poll() is not None:
        print("Logstash failed to run, check the config and logs, then rerun.")
        return None

    # Read stdout in real-time until a line tells us whether Logstash reached
    # the requested startup stage or shut down on the way.
    logs = []
    for stdout_line in iter(process.stdout.readline, ""):
        logs.append(stdout_line.strip())
        # We don't always wait for Logstash to fully start, since slow
        # pipeline start scenarios are also tested.
        if full_start_required is False and "Starting pipeline" in stdout_line:
            break
        if full_start_required is True and "Pipeline started" in stdout_line:
            break
        if "Logstash shut down" in stdout_line or "Logstash stopped" in stdout_line:
            print("Logstash couldn't spin up.")
            print(logs)
            return None

    print(f"Logstash is running with PID: {process.pid}.")
    return process
96-
97-
def stop_logstash(self, process: subprocess.Popen) -> None:
98-
process.terminate()
99-
print(f"Stopping Logstash...")
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Intentionally left blank
Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,48 @@
11
import yaml
2-
from typing import Any, List, Dict, Union
2+
from typing import Any, List, Dict
33

44

55
class ConfigValidator:
    """Validates scenario YAML files for the health-report integration tests."""

    # Required keys per YAML section. "DYNAMIC" stands in for the nested
    # pipeline-id key, whose actual name differs per scenario.
    REQUIRED_KEYS = {
        "root": ["name", "config", "conditions", "expectation"],
        "config": ["pipeline.id", "config.string"],
        "conditions": ["full_start_required"],
        "expectation": ["status", "symptom", "indicators"],
        "indicators": ["pipelines"],
        "pipelines": ["status", "symptom", "indicators"],
        "DYNAMIC": ["status", "symptom", "diagnosis", "impacts", "details"],
        "details": ["status"],
        "status": ["state"],
    }

    def __init__(self):
        # Parsed YAML document; starts empty until a file is loaded.
        self.yaml_content = None
1420

15-
def __validate_keys(self, yaml_sub_keys: List[Dict[str, Any]], required_sub_keys: Dict[str, List[Any]]) -> bool:
16-
for required_sub_key in required_sub_keys:
17-
if isinstance(required_sub_key, str):
18-
is_key_found = False
19-
for yaml_sub_key in yaml_sub_keys:
20-
if yaml_sub_key.get(required_sub_key):
21-
is_key_found = True
22-
break
23-
if not is_key_found:
24-
print(f"Required {required_sub_key} key is not found in {yaml_sub_keys}")
25-
return False
26-
return True
21+
def __has_valid_keys(self, data: Any, key_path: str, repeated: bool) -> bool:
    """Recursively check that `data` contains every key REQUIRED_KEYS demands.

    :param data: current YAML node (dict or list) or a leaf value (str/bool).
    :param key_path: the key under which `data` was found; used to index
        REQUIRED_KEYS for this level.
    :param repeated: True once we are inside the second, nested `indicators`
        section, whose single child key is the dynamic pipeline-id.
    :return: True when all required keys are present, False otherwise.
    """
    # Leaf values (strings, booleans) have no sub-keys left to validate.
    # (annotation fixed: `any` is the builtin function, not a type — use Any)
    if isinstance(data, (str, bool)):
        return True

    # There are two `indicators` sections; for the repeated (nested) one we
    # step down into its single child — the dynamic pipeline-id entry.
    first_key = next(iter(data))
    data = data[first_key] if repeated and key_path == "indicators" else data

    if isinstance(data, dict):
        # The pipeline-id key is dynamic, so its requirements live under "DYNAMIC".
        required = self.REQUIRED_KEYS.get("DYNAMIC" if repeated and key_path == "indicators" else key_path, [])
        repeated = not repeated if key_path == "indicators" else repeated
        for key in required:
            if key not in data:
                print(f"Missing key '{key}' in '{key_path}'")
                return False
            else:
                dic_keys_result = self.__has_valid_keys(data[key], key, repeated)
                if dic_keys_result is False:
                    return False
    elif isinstance(data, list):
        for item in data:
            list_keys_result = self.__has_valid_keys(item, key_path, repeated)
            if list_keys_result is False:
                return False
    return True
3447

3548
def load(self, file_path: str) -> None:
@@ -48,18 +61,9 @@ def is_valid(self) -> bool:
4861
print(f"YAML content is empty.")
4962
return False
5063

51-
if not isinstance(self.yaml_content, Dict):
64+
if not isinstance(self.yaml_content, dict):
5265
print(f"YAML structure is not as expected, it should start with a Dict.")
5366
return False
5467

55-
required_config_keys = list(self.REQUIRED_KEYS.keys())
56-
for yaml_key in self.yaml_content:
57-
if yaml_key == "name":
58-
continue
59-
if yaml_key not in required_config_keys:
60-
return False
61-
if not self.__validate_keys(self.yaml_content.get(yaml_key), self.REQUIRED_KEYS.get(yaml_key)):
62-
return False
63-
64-
print(f"YAML config validation succeeded.")
65-
return True
68+
result = self.__has_valid_keys(self.yaml_content, "root", False)
69+
return True if result is True else False

.buildkite/scripts/health-report-tests/logstash_stats.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

.buildkite/scripts/health-report-tests/main.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
"""
44
import glob
55
import os
6+
import time
7+
import traceback
8+
import yaml
69
from bootstrap import Bootstrap
710
from scenario_executor import ScenarioExecutor
811
from config_validator import ConfigValidator
9-
import yaml
1012

1113

1214
class BootstrapContextManager:
@@ -29,9 +31,9 @@ def __enter__(self):
2931
print(f"logstash-integration-failure_injector successfully installed.")
3032
return self.bootstrap
3133

32-
def __exit__(self, exc_type, exc_value, traceback):
34+
def __exit__(self, exc_type, exc_value, exc_traceback):
    """Context-manager exit hook: print the full traceback if an exception occurred."""
    if exc_type is not None:
        # format_exception returns a list of lines; join them so the output
        # reads as a normal traceback instead of a list repr.
        print("".join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
3537

3638

3739
def main():
@@ -46,24 +48,39 @@ def main():
4648
for scenario_file in scenario_files:
4749
print(f"Validating {scenario_file} scenario file.")
4850
config_validator.load(scenario_file)
49-
if not config_validator.is_valid():
51+
if config_validator.is_valid() is False:
5052
print(f"{scenario_file} scenario file is not valid.")
5153
return
54+
else:
55+
print(f"Validation succeeded.")
5256

57+
has_failed_scenario = False
5358
for scenario_file in scenario_files:
5459
with open(scenario_file, 'r') as file:
5560
# scenario_content: Dict[str, Any] = None
5661
scenario_content = yaml.safe_load(file)
62+
print(f"Testing `{scenario_content.get('name')}` scenario.")
5763
scenario_name = scenario_content['name']
64+
65+
is_full_start_required = next(sub.get('full_start_required') for sub in
66+
scenario_content.get('conditions') if 'full_start_required' in sub)
5867
config = scenario_content['config']
5968
if config is not None:
6069
bootstrap.apply_config(config)
61-
expectation = scenario_content['expectation']
62-
process = bootstrap.run_logstash()
70+
expectations = scenario_content.get("expectation")
71+
process = bootstrap.run_logstash(is_full_start_required)
6372
if process is not None:
64-
scenario_executor.on(scenario_name, expectation)
73+
try:
74+
scenario_executor.on(scenario_name, expectations)
75+
except Exception as e:
76+
print(e)
77+
has_failed_scenario = True
6578
process.terminate()
66-
break
79+
time.sleep(5) # leave some window to terminate the process
80+
81+
if has_failed_scenario:
82+
# intentionally fail due to visibility
83+
raise Exception("Some of scenarios failed, check the log for details.")
6784

6885

6986
if __name__ == "__main__":
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
requests==2.32.3
2-
deepdiff==8.0.1
32
pyyaml==6.0.2
Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
A class to execute the given scenario for Logstash Health Report integration test
33
"""
4-
from deepdiff import DeepDiff
4+
import time
55
from logstash_health_report import LogstashHealthReport
66

77

@@ -11,33 +11,55 @@ class ScenarioExecutor:
1111
def __init__(self):
1212
pass
1313

14-
def __is_expected(self, scenario_content: list) -> None:
15-
logstash_health = self.logstash_health_report_api.get()
16-
print(f"Logstash health report: {logstash_health}")
14+
def __has_intersection(self, expects, results):
    """Return True when every expected entry is present somewhere in `results`.

    Each element of `expects` is a dict of key/value pairs; it is satisfied
    when at least one dict in `results` contains all of those pairs.
    """
    for expect in expects:
        # Fixed: the original returned False if ANY result failed to match an
        # expect (requiring every expect to match every result); an expect
        # only needs to be found in at least one result.
        if not any(all(key in result and result[key] == value for key, value in expect.items())
                   for result in results):
            return False
    return True
1721

18-
differences = []
19-
for index, item in enumerate(scenario_content):
20-
if "expectation" in item:
21-
key = f"Item {index + 1}"
22-
stat_value = logstash_health.get(key, {}).get("expectation")
22+
def __get_difference(self, differences: list, expectations: dict, reports: dict) -> list:
    """Recursively collect mismatches between scenario expectations and the
    health report, appending each one to (and returning) `differences`.

    Return annotation fixed: the method returns the `differences` list, not a dict.
    """
    for key in expectations.keys():
        if key == "help_url":  # help_url URL value may change
            continue

        # Hoist the repeated .get(key) lookups into locals.
        expected = expectations.get(key)
        reported = reports.get(key)

        # A type mismatch means the report's structure diverged — stop here.
        if type(expected) != type(reported):
            differences.append(f"Scenario expectation and Health API report structure differs for {key}.")
            return differences

        if isinstance(expected, str):
            if expected != reported:
                differences.append({key: {"expected": expected, "got": reported}})
            continue
        elif isinstance(expected, dict):
            self.__get_difference(differences, expected, reported)
        elif isinstance(expected, list):
            # List entries only need to be contained in the report, not equal to it.
            if not self.__has_intersection(expected, reported):
                differences.append({key: {"expected": expected, "got": reported}})
    return differences
41+
42+
def __is_expected(self, expectations: dict) -> bool:
    """Fetch the current health report and compare it against `expectations`.

    Return annotation fixed: the method returns a bool, not None.

    :return: True when no differences were found, False otherwise.
    """
    reports = self.logstash_health_report_api.get()
    differences = self.__get_difference([], expectations, reports)
    if differences:
        print("Differences found in 'expectation' section between YAML content and stats:")
        for diff in differences:
            print(f"Difference: {diff}")
        return False
    else:
        return True
3952

40-
def on(self, scenario_name: str, scenario_content: list) -> None:
41-
print(f"Testing the scenario: {scenario_content}")
42-
if self.__is_expected(scenario_content) is False:
53+
def on(self, scenario_name: str, expectations: dict) -> None:
    """Check the scenario's expectations against the health report, retrying
    up to 5 times (1 second apart) before declaring the scenario failed.

    :raises Exception: when the expectations still don't match after all attempts.
    """
    attempts = 5
    while self.__is_expected(expectations) is False:
        attempts = attempts - 1
        if attempts == 0:
            break
        time.sleep(1)

    if attempts == 0:
        raise Exception(f"{scenario_name} failed.")
    else:
        # typo fixed: "expectaion" -> "expectation"
        print(f"Scenario `{scenario_name}` expectation meets the health report stats.")
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: "Abnormally terminated pipeline"
2+
config:
3+
- pipeline.id: abnormally-terminated-pp
4+
config.string: |
5+
input { heartbeat { interval => 1 } }
6+
filter { failure_injector { crash_at => filter } }
7+
output { stdout {} }
8+
pipeline.workers: 1
9+
pipeline.batch.size: 1
10+
conditions:
11+
- full_start_required: true
12+
expectation:
13+
status: "red"
14+
symptom: "1 indicator is unhealthy (`pipelines`)"
15+
indicators:
16+
pipelines:
17+
status: "red"
18+
symptom: "1 indicator is unhealthy (`abnormally-terminated-pp`)"
19+
indicators:
20+
abnormally-terminated-pp:
21+
status: "red"
22+
symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
23+
diagnosis:
24+
- cause: "pipeline is not running, likely because it has encountered an error"
25+
- action: "view logs to determine the cause of abnormal pipeline shutdown"
26+
impacts:
27+
- description: "the pipeline is not currently processing"
28+
- impact_areas: ["pipeline_execution"]
29+
details:
30+
status:
31+
state: "TERMINATED"
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name: "Successfully terminated pipeline"
2+
config:
3+
- pipeline.id: normally-terminated-pp
4+
config.string: |
5+
input { generator { count => 1 } }
6+
output { stdout {} }
7+
pipeline.workers: 1
8+
pipeline.batch.size: 1
9+
conditions:
10+
- full_start_required: true
11+
expectation:
12+
status: "yellow"
13+
symptom: "1 indicator is concerning (`pipelines`)"
14+
indicators:
15+
pipelines:
16+
status: "yellow"
17+
symptom: "1 indicator is concerning (`normally-terminated-pp`)"
18+
indicators:
19+
normally-terminated-pp:
20+
status: "yellow"
21+
symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
22+
diagnosis:
23+
- cause: "pipeline has finished running because its inputs have been closed and events have been processed"
24+
- action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
25+
impacts:
26+
- impact_areas: ["pipeline_execution"]
27+
details:
28+
status:
29+
state: "FINISHED"

0 commit comments

Comments
 (0)