From 2e53dbb4c0d9bfe79cd79e0a0ece9db065b286df Mon Sep 17 00:00:00 2001 From: juanvallejo Date: Wed, 22 Mar 2017 15:52:35 -0400 Subject: add elasticseatch, fluentd, kibana check --- roles/openshift_health_checker/library/ocutil.py | 74 +++++++ .../openshift_checks/__init__.py | 22 +- .../openshift_checks/logging/__init__.py | 0 .../openshift_checks/logging/curator.py | 61 ++++++ .../openshift_checks/logging/elasticsearch.py | 217 +++++++++++++++++++ .../openshift_checks/logging/fluentd.py | 170 +++++++++++++++ .../openshift_checks/logging/kibana.py | 229 +++++++++++++++++++++ .../openshift_checks/logging/logging.py | 96 +++++++++ .../openshift_health_checker/test/curator_test.py | 68 ++++++ .../test/elasticsearch_test.py | 180 ++++++++++++++++ .../openshift_health_checker/test/fluentd_test.py | 109 ++++++++++ roles/openshift_health_checker/test/kibana_test.py | 218 ++++++++++++++++++++ .../test/logging_check_test.py | 137 ++++++++++++ 13 files changed, 1575 insertions(+), 6 deletions(-) create mode 100644 roles/openshift_health_checker/library/ocutil.py create mode 100644 roles/openshift_health_checker/openshift_checks/logging/__init__.py create mode 100644 roles/openshift_health_checker/openshift_checks/logging/curator.py create mode 100644 roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py create mode 100644 roles/openshift_health_checker/openshift_checks/logging/fluentd.py create mode 100644 roles/openshift_health_checker/openshift_checks/logging/kibana.py create mode 100644 roles/openshift_health_checker/openshift_checks/logging/logging.py create mode 100644 roles/openshift_health_checker/test/curator_test.py create mode 100644 roles/openshift_health_checker/test/elasticsearch_test.py create mode 100644 roles/openshift_health_checker/test/fluentd_test.py create mode 100644 roles/openshift_health_checker/test/kibana_test.py create mode 100644 roles/openshift_health_checker/test/logging_check_test.py (limited to 'roles/openshift_health_checker') diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py new file mode 100644 index 000000000..2e60735d6 --- /dev/null +++ b/roles/openshift_health_checker/library/ocutil.py @@ -0,0 +1,74 @@ +#!/usr/bin/python +"""Interface to OpenShift oc command""" + +import os +import shlex +import shutil +import subprocess + +from ansible.module_utils.basic import AnsibleModule + + +ADDITIONAL_PATH_LOOKUPS = ['/usr/local/bin', os.path.expanduser('~/bin')] + + +def locate_oc_binary(): + """Find and return oc binary file""" + # https://github.com/openshift/openshift-ansible/issues/3410 + # oc can be in /usr/local/bin in some cases, but that may not + # be in $PATH due to ansible/sudo + paths = os.environ.get("PATH", os.defpath).split(os.pathsep) + ADDITIONAL_PATH_LOOKUPS + + oc_binary = 'oc' + + # Use shutil.which if it is available, otherwise fallback to a naive path search + try: + which_result = shutil.which(oc_binary, path=os.pathsep.join(paths)) + if which_result is not None: + oc_binary = which_result + except AttributeError: + for path in paths: + if os.path.exists(os.path.join(path, oc_binary)): + oc_binary = os.path.join(path, oc_binary) + break + + return oc_binary + + +def main(): + """Module that executes commands on a remote OpenShift cluster""" + + module = AnsibleModule( + argument_spec=dict( + namespace=dict(type="str", required=True), + config_file=dict(type="str", required=True), + cmd=dict(type="str", required=True), + extra_args=dict(type="list", default=[]), + ), + ) + + cmd = [ + locate_oc_binary(), + '--config', module.params["config_file"], + '-n', module.params["namespace"], + ] + shlex.split(module.params["cmd"]) + + failed = True + try: + cmd_result = subprocess.check_output(list(cmd), stderr=subprocess.STDOUT) + failed = False + except subprocess.CalledProcessError as exc: + cmd_result = '[rc {}] {}\n{}'.format(exc.returncode, ' '.join(exc.cmd), exc.output) + except OSError as exc: + # we get this when 'oc' is not there + cmd_result = str(exc) + + module.exit_json( + changed=False, + failed=failed, + result=cmd_result, + ) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index be63d864a..5c9949ced 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -66,16 +66,26 @@ class OpenShiftCheck(object): LOADER_EXCLUDES = ( "__init__.py", "mixins.py", + "logging.py", ) -def load_checks(): +def load_checks(path=None, subpkg=""): """Dynamically import all check modules for the side effect of registering checks.""" - return [ - import_module(__package__ + "." + name[:-3]) - for name in os.listdir(os.path.dirname(__file__)) - if name.endswith(".py") and name not in LOADER_EXCLUDES - ] + if path is None: + path = os.path.dirname(__file__) + + modules = [] + + for name in os.listdir(path): + if os.path.isdir(os.path.join(path, name)): + modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name) + continue + + if name.endswith(".py") and name not in LOADER_EXCLUDES: + modules.append(import_module(__package__ + subpkg + "." + name[:-3])) + + return modules def get_var(task_vars, *keys, **kwargs): diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py new file mode 100644 index 000000000..c9fc59896 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -0,0 +1,61 @@ +""" +Module for performing checks on an Curator logging deployment +""" + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Curator(LoggingCheck): + """Module that checks an integrated logging Curator deployment""" + + name = "curator" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + curator_pods, error = super(Curator, self).get_pods_for_component( + self.module_executor, + self.logging_namespace, + "curator", + task_vars + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_curator(curator_pods) + + if check_error: + msg = ("The following Curator deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'} + + def check_curator(self, pods): + """Check to see if curator is up and working. Returns: error string""" + if not pods: + return ( + "There are no Curator pods for the logging stack,\n" + "so nothing will prune Elasticsearch indexes.\n" + "Is Curator correctly deployed?" + ) + + not_running = super(Curator, self).not_running_pods(pods) + if len(not_running) == len(pods): + return ( + "The Curator pod is not currently in a running state,\n" + "so Elasticsearch indexes may increase without bound." + ) + if len(pods) - len(not_running) > 1: + return ( + "There is more than one Curator pod running. This should not normally happen.\n" + "Although this doesn't cause any problems, you may want to investigate." + ) + + return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py new file mode 100644 index 000000000..01cb35b81 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -0,0 +1,217 @@ +""" +Module for performing checks on an Elasticsearch logging deployment +""" + +import json +import re + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Elasticsearch(LoggingCheck): + """Module that checks an integrated logging Elasticsearch deployment""" + + name = "elasticsearch" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + es_pods, error = super(Elasticsearch, self).get_pods_for_component( + self.execute_module, + self.logging_namespace, + "es", + task_vars, + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_elasticsearch(es_pods, task_vars) + + if check_error: + msg = ("The following Elasticsearch deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'} + + def _not_running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = super(Elasticsearch, self).not_running_pods(es_pods) + if not_running: + return not_running, [( + 'The following Elasticsearch pods are not running:\n' + '{pods}' + 'These pods will not aggregate logs from their nodes.' + ).format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + ))] + return not_running, [] + + def check_elasticsearch(self, es_pods, task_vars): + """Various checks for elasticsearch. Returns: error string""" + not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running_pods] + pods_by_name = { + pod['metadata']['name']: pod for pod in running_pods + # Filter out pods that are not members of a DC + if pod['metadata'].get('labels', {}).get('deploymentconfig') + } + if not pods_by_name: + return 'No logging Elasticsearch pods were found. Is logging deployed?' + error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars) + error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars) + error_msgs += self._check_es_cluster_health(pods_by_name, task_vars) + error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars) + return '\n'.join(error_msgs) + + @staticmethod + def _build_es_curl_cmd(pod_name, url): + base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" + return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) + + def _check_elasticsearch_masters(self, pods_by_name, task_vars): + """Check that Elasticsearch masters are sane. Returns: list of error strings""" + es_master_names = set() + error_msgs = [] + for pod_name in pods_by_name.keys(): + # Compare what each ES node reports as master and compare for split brain + get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") + master_name_str = self._exec_oc(get_master_cmd, [], task_vars) + master_names = (master_name_str or '').split(' ') + if len(master_names) > 1: + es_master_names.add(master_names[1]) + else: + error_msgs.append( + 'No master? Elasticsearch {pod} returned bad string when asked master name:\n' + ' {response}'.format(pod=pod_name, response=master_name_str) + ) + + if not es_master_names: + error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?') + return '\n'.join(error_msgs) + + if len(es_master_names) > 1: + error_msgs.append( + 'Found multiple Elasticsearch masters according to the pods:\n' + '{master_list}\n' + 'This implies that the masters have "split brain" and are not correctly\n' + 'replicating data for the logging cluster. Log loss is likely to occur.' + .format(master_list='\n'.join(' ' + master for master in es_master_names)) + ) + + return error_msgs + + def _check_elasticsearch_node_list(self, pods_by_name, task_vars): + """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" + + if not pods_by_name: + return ['No logging Elasticsearch masters were found. Is logging deployed?'] + + # get ES cluster nodes + node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') + cluster_node_data = self._exec_oc(node_cmd, [], task_vars) + try: + cluster_nodes = json.loads(cluster_node_data)['nodes'] + except (ValueError, KeyError): + return [ + 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + + cluster_node_data + ] + + # Try to match all ES-reported node hosts to known pods. + error_msgs = [] + for node in cluster_nodes.values(): + # Note that with 1.4/3.4 the pod IP may be used as the master name + if not any(node['host'] in (pod_name, pod['status'].get('podIP')) + for pod_name, pod in pods_by_name.items()): + error_msgs.append( + 'The Elasticsearch cluster reports a member node "{node}"\n' + 'that does not correspond to any known ES pod.'.format(node=node['host']) + ) + + return error_msgs + + def _check_es_cluster_health(self, pods_by_name, task_vars): + """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" + error_msgs = [] + for pod_name in pods_by_name.keys(): + cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') + cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars) + try: + health_res = json.loads(cluster_health_data) + if not health_res or not health_res.get('status'): + raise ValueError() + except ValueError: + error_msgs.append( + 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' + 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) + ) + continue + + if health_res['status'] not in ['green', 'yellow']: + error_msgs.append( + 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) + ) + + return error_msgs + + def _check_elasticsearch_diskspace(self, pods_by_name, task_vars): + """ + Exec into an ES pod and query the diskspace on the persistent volume. + Returns: list of errors + """ + error_msgs = [] + for pod_name in pods_by_name.keys(): + df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) + disk_output = self._exec_oc(df_cmd, [], task_vars) + lines = disk_output.splitlines() + # expecting one header looking like 'IUse% Use%' and one body line + body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' + if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): + error_msgs.append( + 'Could not retrieve storage usage from logging ES pod "{pod}".\n' + 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) + ) + continue + inode_pct, disk_pct = re.match(body_re, lines[1]).groups() + + inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90') + if int(inode_pct) >= int(inode_pct_thresh): + error_msgs.append( + 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' + ' is {pct}, greater than threshold {limit}.\n' + ' Note: threshold can be specified in inventory with {param}'.format( + pod=pod_name, + pct=str(inode_pct), + limit=str(inode_pct_thresh), + param='openshift_check_efk_es_inode_pct', + )) + disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80') + if int(disk_pct) >= int(disk_pct_thresh): + error_msgs.append( + 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' + ' is {pct}, greater than threshold {limit}.\n' + ' Note: threshold can be specified in inventory with {param}'.format( + pod=pod_name, + pct=str(disk_pct), + limit=str(disk_pct_thresh), + param='openshift_check_efk_es_storage_pct', + )) + + return error_msgs + + def _exec_oc(self, cmd_str, extra_args, task_vars): + return super(Elasticsearch, self).exec_oc( + self.execute_module, + self.logging_namespace, + cmd_str, + extra_args, + task_vars, + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py new file mode 100644 index 000000000..627567293 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -0,0 +1,170 @@ +""" +Module for performing checks on an Fluentd logging deployment +""" + +import json + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Fluentd(LoggingCheck): + """Module that checks an integrated logging Fluentd deployment""" + name = "fluentd" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + fluentd_pods, error = super(Fluentd, self).get_pods_for_component( + self.execute_module, + self.logging_namespace, + "fluentd", + task_vars, + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_fluentd(fluentd_pods, task_vars) + + if check_error: + msg = ("The following Fluentd deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'} + + @staticmethod + def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" + label, value = node_selector.split('=', 1) + fluentd_nodes = { + name: node for name, node in nodes_by_name.items() + if node['metadata']['labels'].get(label) == value + } + if not fluentd_nodes: + return None, ( + 'There are no nodes with the fluentd label {label}.\n' + 'This means no logs will be aggregated from the nodes.' + ).format(label=node_selector) + return fluentd_nodes, None + + @staticmethod + def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars): + """Note if nodes are not labeled as expected. Returns: error string""" + intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all']) + if not intended_nodes or '--all' in intended_nodes: + intended_nodes = nodes_by_name.keys() + nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) + if nodes_missing_labels: + return ( + 'The following nodes are supposed to be labeled with {label} but are not:\n' + ' {nodes}\n' + 'Fluentd will not aggregate logs from these nodes.' + ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) + return None + + @staticmethod + def _check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error string""" + unmatched_nodes = fluentd_nodes.copy() + node_names_by_label = { + node['metadata']['labels']['kubernetes.io/hostname']: name + for name, node in fluentd_nodes.items() + } + node_names_by_internal_ip = { + address['address']: name + for name, node in fluentd_nodes.items() + for address in node['status']['addresses'] + if address['type'] == "InternalIP" + } + for pod in pods: + for name in [ + pod['spec']['nodeName'], + node_names_by_internal_ip.get(pod['spec']['nodeName']), + node_names_by_label.get(pod.get('spec', {}).get('host')), + ]: + unmatched_nodes.pop(name, None) + if unmatched_nodes: + return ( + 'The following nodes are supposed to have a Fluentd pod but do not:\n' + '{nodes}' + 'These nodes will not have their logs aggregated.' + ).format(nodes=''.join( + " {}\n".format(name) + for name in unmatched_nodes.keys() + )) + return None + + def _check_fluentd_pods_running(self, pods): + """Make sure all fluentd pods are running. Returns: error string""" + not_running = super(Fluentd, self).not_running_pods(pods) + if not_running: + return ( + 'The following Fluentd pods are supposed to be running but are not:\n' + '{pods}' + 'These pods will not aggregate logs from their nodes.' + ).format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + return None + + def check_fluentd(self, pods, task_vars): + """Verify fluentd is running everywhere. Returns: error string""" + + node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true') + + nodes_by_name, error = self.get_nodes_by_name(task_vars) + + if error: + return error + fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) + if error: + return error + + error_msgs = [] + error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars) + if error: + error_msgs.append(error) + error = self._check_nodes_have_fluentd(pods, fluentd_nodes) + if error: + error_msgs.append(error) + error = self._check_fluentd_pods_running(pods) + if error: + error_msgs.append(error) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + error_msgs.append( + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + ) + + return '\n'.join(error_msgs) + + def get_nodes_by_name(self, task_vars): + """Retrieve all the node definitions. Returns: dict(name: node), error string""" + nodes_json = self._exec_oc("get nodes -o json", [], task_vars) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json + if not nodes or not nodes.get('items'): # also should not happen + return None, "No nodes appear to be defined according to the API." + return { + node['metadata']['name']: node + for node in nodes['items'] + }, None + + def _exec_oc(self, cmd_str, extra_args, task_vars): + return super(Fluentd, self).exec_oc(self.execute_module, + self.logging_namespace, + cmd_str, + extra_args, + task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py new file mode 100644 index 000000000..442f407b1 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -0,0 +1,229 @@ +""" +Module for performing checks on a Kibana logging deployment +""" + +import json +import ssl + +try: + from urllib2 import HTTPError, URLError + import urllib2 +except ImportError: + from urllib.error import HTTPError, URLError + import urllib.request as urllib2 + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Kibana(LoggingCheck): + """Module that checks an integrated logging Kibana deployment""" + + name = "kibana" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + kibana_pods, error = super(Kibana, self).get_pods_for_component( + self.execute_module, + self.logging_namespace, + "kibana", + task_vars, + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_kibana(kibana_pods) + + if not check_error: + check_error = self._check_kibana_route(task_vars) + + if check_error: + msg = ("The following Kibana deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'} + + def _verify_url_internal(self, url, task_vars): + """ + Try to reach a URL from the host. + Returns: success (bool), reason (for failure) + """ + args = dict( + url=url, + follow_redirects='none', + validate_certs='no', # likely to be signed with internal CA + # TODO(lmeyer): give users option to validate certs + status_code=302, + ) + result = self.execute_module('uri', args, task_vars) + if result.get('failed'): + return result['msg'] + return None + + @staticmethod + def _verify_url_external(url): + """ + Try to reach a URL from ansible control host. + Returns: success (bool), reason (for failure) + """ + # This actually checks from the ansible control host, which may or may not + # really be "external" to the cluster. + + # Disable SSL cert validation to work around internally signed certs + ctx = ssl.create_default_context() + ctx.check_hostname = False # or setting CERT_NONE is refused + ctx.verify_mode = ssl.CERT_NONE + + # Verify that the url is returning a valid response + try: + # We only care if the url connects and responds + return_code = urllib2.urlopen(url, context=ctx).getcode() + except HTTPError as httperr: + return httperr.reason + except URLError as urlerr: + return str(urlerr) + + # there appears to be no way to prevent urlopen from following redirects + if return_code != 200: + return 'Expected success (200) but got return code {}'.format(int(return_code)) + + return None + + def check_kibana(self, pods): + """Check to see if Kibana is up and working. Returns: error string.""" + + if not pods: + return "There are no Kibana pods deployed, so no access to the logging UI." + + not_running = self.not_running_pods(pods) + if len(not_running) == len(pods): + return "No Kibana pod is in a running state, so there is no access to the logging UI." + elif not_running: + return ( + "The following Kibana pods are not currently in a running state:\n" + "{pods}" + "However at least one is, so service may not be impacted." + ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running)) + + return None + + def _get_kibana_url(self, task_vars): + """ + Get kibana route or report error. + Returns: url (or empty), reason for failure + """ + + # Get logging url + get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars) + if not get_route: + return None, 'no_route_exists' + + route = json.loads(get_route) + + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + # ingress can be null if there is no router, or empty if not routed + if not ingress or not ingress[0]: + return None, 'route_not_accepted' + + host = route.get("spec", {}).get("host") + if not host: + return None, 'route_missing_host' + + return 'https://{}/'.format(host), None + + def _check_kibana_route(self, task_vars): + """ + Check to see if kibana route is up and working. + Returns: error string + """ + known_errors = dict( + no_route_exists=( + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ), + route_not_accepted=( + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ), + route_missing_host=( + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ), + ) + + kibana_url, error = self._get_kibana_url(task_vars) + if not kibana_url: + return known_errors.get(error, error) + + # first, check that kibana is reachable from the master. + error = self._verify_url_internal(kibana_url, task_vars) + if error: + if 'urlopen error [Errno 111] Connection refused' in error: + error = ( + 'Failed to connect from this master to Kibana URL {url}\n' + 'Is kibana running, and is at least one router routing to it?' + ).format(url=kibana_url) + elif 'urlopen error [Errno -2] Name or service not known' in error: + error = ( + 'Failed to connect from this master to Kibana URL {url}\n' + 'because the hostname does not resolve.\n' + 'Is DNS configured for the Kibana hostname?' + ).format(url=kibana_url) + elif 'Status code was not' in error: + error = ( + 'A request from this master to the Kibana URL {url}\n' + 'did not return the correct status code (302).\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues. The output was:\n' + ' {error}' + ).format(url=kibana_url, error=error) + return 'Error validating the logging Kibana route:\n' + error + + # in production we would like the kibana route to work from outside the + # cluster too; but that may not be the case, so allow disabling just this part. + if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True): + return None + error = self._verify_url_external(kibana_url) + if error: + if 'urlopen error [Errno 111] Connection refused' in error: + error = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + elif 'urlopen error [Errno -2] Name or service not known' in error: + error = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + elif 'Expected success (200)' in error: + error = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + error = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set in your inventory:\n' + ' openshift_check_efk_kibana_external=False' + ).format(error=error) + return error + return None + + def _exec_oc(self, cmd_str, extra_args, task_vars): + return super(Kibana, self).exec_oc(self.execute_module, + self.logging_namespace, + cmd_str, + extra_args, + task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py new file mode 100644 index 000000000..05b4d300c --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -0,0 +1,96 @@ +""" +Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack +""" + +import json +import os + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var + + +class LoggingCheck(OpenShiftCheck): + """Base class for logging component checks""" + + name = "logging" + + @classmethod + def is_active(cls, task_vars): + return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars) + + @staticmethod + def is_first_master(task_vars): + """Run only on first master and only when logging is configured. Returns: bool""" + logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True) + # Note: It would be nice to use membership in oo_first_master group, however for now it + # seems best to avoid requiring that setup and just check this is the first master. + hostname = get_var(task_vars, "ansible_ssh_host") or [None] + masters = get_var(task_vars, "groups", "masters", default=None) or [None] + return logging_deployed and masters[0] == hostname + + def run(self, tmp, task_vars): + pass + + def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars): + """Get all pods for a given component. Returns: list of pods for component, error string""" + pod_output = self.exec_oc( + execute_module, + namespace, + "get pods -l component={} -o json".format(logging_component), + [], + task_vars + ) + try: + pods = json.loads(pod_output) + if not pods or not pods.get('items'): + raise ValueError() + except ValueError: + # successful run but non-parsing data generally means there were no pods in the namespace + return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace) + + return pods['items'], None + + @staticmethod + def not_running_pods(pods): + """Returns: list of pods not in a ready and running state""" + return [ + pod for pod in pods + if any( + container['ready'] is False + for container in pod['status']['containerStatuses'] + ) or not any( + condition['type'] == 'Ready' and condition['status'] == 'True' + for condition in pod['status']['conditions'] + ) + ] + + @staticmethod + def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None): + """ + Execute an 'oc' command in the remote host. + Returns: output of command and namespace, + or raises OpenShiftCheckException on error + """ + config_base = get_var(task_vars, "openshift", "common", "config_base") + args = { + "namespace": namespace, + "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), + "cmd": cmd_str, + "extra_args": list(extra_args) if extra_args else [], + } + + result = execute_module("ocutil", args, task_vars) + if result.get("failed"): + msg = ( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}' + ).format(cmd=args['cmd'], error=result['result']) + + if result['result'] == '[Errno 2] No such file or directory': + msg = ( + "This host is supposed to be a master but does not have the `oc` command where expected.\n" + "Has an installation been run on this host yet?" + ) + raise OpenShiftCheckException(msg) + + return result.get("result", "") diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py new file mode 100644 index 000000000..ae108c96e --- /dev/null +++ b/roles/openshift_health_checker/test/curator_test.py @@ -0,0 +1,68 @@ +import pytest + +from openshift_checks.logging.curator import Curator + + +def canned_curator(exec_oc=None): + """Create a Curator check object with canned exec_oc method""" + check = Curator("dummy") # fails if a module is actually invoked + if exec_oc: + check._exec_oc = exec_oc + return check + + +def assert_error(error, expect_error): + if expect_error: + assert error + assert expect_error in error + else: + assert not error + + +plain_curator_pod = { + "metadata": { + "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, + "name": "logging-curator-1", + }, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + "podIP": "10.10.10.10", + } +} + +not_running_curator_pod = { + "metadata": { + "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, + "name": "logging-curator-2", + }, + "status": { + "containerStatuses": [{"ready": False}], + "conditions": [{"status": "False", "type": "Ready"}], + "podIP": "10.10.10.10", + } +} + + +@pytest.mark.parametrize('pods, expect_error', [ + ( + [], + "no Curator pods", + ), + ( + [plain_curator_pod], + None, + ), + ( + [not_running_curator_pod], + "not currently in a running state", + ), + ( + [plain_curator_pod, plain_curator_pod], + "more than one Curator pod", + ), +]) +def test_get_curator_pods(pods, expect_error): + check = canned_curator() + error = check.check_curator(pods) + assert_error(error, expect_error) diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py new file mode 100644 index 000000000..b9d375d8c --- /dev/null +++ b/roles/openshift_health_checker/test/elasticsearch_test.py @@ -0,0 +1,180 @@ +import pytest +import json + +from openshift_checks.logging.elasticsearch import Elasticsearch + +task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) + + +def canned_elasticsearch(exec_oc=None): + """Create an Elasticsearch check object with canned exec_oc method""" + check = Elasticsearch("dummy") # fails if a module is actually invoked + if exec_oc: + check._exec_oc = exec_oc + return check + + +def assert_error(error, expect_error): + if expect_error: + assert error + assert expect_error in error + else: + assert not error + + +plain_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es"}, + "name": "logging-es", + }, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + "containerStatuses": [{"ready": True}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "name logging-es", +} + +split_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-2"}, + "name": "logging-es-2", + }, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + "containerStatuses": [{"ready": True}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "name logging-es-2", +} + + +def test_check_elasticsearch(): + assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([], {}) + + # canned oc responses to match so all the checks pass + def _exec_oc(cmd, args, task_vars): + if '_cat/master' in cmd: + return 'name logging-es' + elif '/_nodes' in cmd: + return json.dumps(es_node_list) + elif '_cluster/health' in cmd: + return '{"status": "green"}' + elif ' df ' in cmd: + return 'IUse% Use%\n 3% 4%\n' + else: + raise Exception(cmd) + + assert not canned_elasticsearch(_exec_oc).check_elasticsearch([plain_es_pod], {}) + + +def pods_by_name(pods): + return {pod['metadata']['name']: pod for pod in pods} + + +@pytest.mark.parametrize('pods, expect_error', [ + ( + [], + 'No logging Elasticsearch masters', + ), + ( + [plain_es_pod], + None, + ), + ( + [plain_es_pod, split_es_pod], + 'Found multiple Elasticsearch masters', + ), +]) +def test_check_elasticsearch_masters(pods, expect_error): + test_pods = list(pods) + check = canned_elasticsearch(lambda cmd, args, task_vars: test_pods.pop(0)['_test_master_name_str']) + + errors = check._check_elasticsearch_masters(pods_by_name(pods), task_vars_config_base) + assert_error(''.join(errors), expect_error) + + +es_node_list = { + 'nodes': { + 'random-es-name': { + 'host': 'logging-es', + }}} + + +@pytest.mark.parametrize('pods, node_list, expect_error', [ + ( + [], + {}, + 'No logging Elasticsearch masters', + ), + ( + [plain_es_pod], + es_node_list, + None, + ), + ( + [plain_es_pod], + {}, # empty list of nodes triggers KeyError + "Failed to query", + ), + ( + [split_es_pod], + es_node_list, + 'does not correspond to any known ES pod', + ), +]) +def test_check_elasticsearch_node_list(pods, node_list, expect_error): + check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(node_list)) + + errors = check._check_elasticsearch_node_list(pods_by_name(pods), task_vars_config_base) + assert_error(''.join(errors), expect_error) + + +@pytest.mark.parametrize('pods, health_data, expect_error', [ + ( + [plain_es_pod], + [{"status": "green"}], + None, + ), + ( + [plain_es_pod], + [{"no-status": "should bomb"}], + 'Could not retrieve cluster health status', + ), + ( + [plain_es_pod, split_es_pod], + [{"status": "green"}, {"status": "red"}], + 'Elasticsearch cluster health status is RED', + ), +]) +def test_check_elasticsearch_cluster_health(pods, health_data, expect_error): + test_health_data = list(health_data) + check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(test_health_data.pop(0))) + + errors = check._check_es_cluster_health(pods_by_name(pods), task_vars_config_base) + assert_error(''.join(errors), expect_error) + + +@pytest.mark.parametrize('disk_data, expect_error', [ + ( + 'df: /elasticsearch/persistent: No such file or directory\n', + 'Could not retrieve storage usage', + ), + ( + 'IUse% Use%\n 3% 4%\n', + None, + ), + ( + 'IUse% Use%\n 95% 40%\n', + 'Inode percent usage on the storage volume', + ), + ( + 'IUse% Use%\n 3% 94%\n', + 'Disk percent usage on the storage volume', + ), +]) +def test_check_elasticsearch_diskspace(disk_data, expect_error): + check = canned_elasticsearch(lambda cmd, args, task_vars: disk_data) + + errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod]), task_vars_config_base) + assert_error(''.join(errors), expect_error) diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py new file mode 100644 index 000000000..d151c0b19 --- /dev/null +++ b/roles/openshift_health_checker/test/fluentd_test.py @@ -0,0 +1,109 @@ +import pytest +import json + +from openshift_checks.logging.fluentd import Fluentd + + +def canned_fluentd(exec_oc=None): + """Create a Fluentd check object with canned exec_oc method""" + check = Fluentd("dummy") # fails if a module is actually invoked + if exec_oc: + check._exec_oc = exec_oc + return check + + +def assert_error(error, expect_error): + if expect_error: + assert error + assert expect_error in error + else: + assert not error + + +fluentd_pod_node1 = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": {"host": "node1", "nodeName": "node1"}, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} +fluentd_pod_node2_down = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-2", + }, + "spec": {"host": "node2", "nodeName": "node2"}, + "status": { + "containerStatuses": [{"ready": False}], + "conditions": [{"status": "False", "type": "Ready"}], + } +} +fluentd_node1 = { + "metadata": { + "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "node1"}, + "name": "node1", + }, + "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.1"}]}, +} +fluentd_node2 = { + "metadata": { + "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "hostname"}, + "name": "node2", + }, + "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.2"}]}, +} +fluentd_node3_unlabeled = { + "metadata": { + "labels": {"kubernetes.io/hostname": "hostname"}, + "name": "node3", + }, + "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.3"}]}, +} + + +@pytest.mark.parametrize('pods, nodes, expect_error', [ + ( + [], + [], + 'No nodes appear to be defined', + ), + ( + [], + [fluentd_node3_unlabeled], + 'There are no nodes with the fluentd label', + ), + ( + [], + [fluentd_node1, fluentd_node3_unlabeled], + 'Fluentd will not aggregate logs from these nodes.', + ), + ( + [], + [fluentd_node2], + "nodes are supposed to have a Fluentd pod but do not", + ), + ( + [fluentd_pod_node1, fluentd_pod_node1], + [fluentd_node1], + 'more Fluentd pods running than nodes labeled', + ), + ( + [fluentd_pod_node2_down], + [fluentd_node2], + "Fluentd pods are supposed to be running", + ), + ( + [fluentd_pod_node1], + [fluentd_node1], + None, + ), +]) +def test_get_fluentd_pods(pods, nodes, expect_error): + check = canned_fluentd(lambda cmd, args, task_vars: json.dumps(dict(items=nodes))) + + error = check.check_fluentd(pods, {}) + assert_error(error, expect_error) diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py new file mode 100644 index 000000000..19140a1b6 --- /dev/null +++ b/roles/openshift_health_checker/test/kibana_test.py @@ -0,0 +1,218 @@ +import pytest +import json + +try: + import urllib2 + from urllib2 import HTTPError, URLError +except ImportError: + from urllib.error import HTTPError, URLError + import urllib.request as urllib2 + +from openshift_checks.logging.kibana import Kibana + + +def canned_kibana(exec_oc=None): + """Create a Kibana check object with canned exec_oc method""" + check = Kibana("dummy") # fails if a module is actually invoked + if exec_oc: + check._exec_oc = exec_oc + return check + + +def assert_error(error, expect_error): + if expect_error: + assert error + assert expect_error in error + else: + assert not error + + +plain_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} +not_running_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-2", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": False}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + + +@pytest.mark.parametrize('pods, expect_error', [ + ( + [], + "There are no Kibana pods deployed", + ), + ( + [plain_kibana_pod], + None, + ), + ( + [not_running_kibana_pod], + "No Kibana pod is in a running state", + ), + ( + [plain_kibana_pod, not_running_kibana_pod], + "The following Kibana pods are not currently in a running state", + ), +]) +def test_check_kibana(pods, expect_error): + check = canned_kibana() + error = check.check_kibana(pods) + assert_error(error, expect_error) + + +@pytest.mark.parametrize('route, expect_url, expect_error', [ + ( + None, + None, + 'no_route_exists', + ), + + # test route with no ingress + ( + { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana", + }, + "status": { + "ingress": [], + }, + "spec": { + "host": "hostname", + } + }, + None, + 'route_not_accepted', + ), + + # test route with no host + ( + { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana", + }, + "status": { + "ingress": [{ + "status": True, + }], + }, + "spec": {}, + }, + None, + 'route_missing_host', + ), + + # test route that looks fine + ( + { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana", + }, + "status": { + "ingress": [{ + "status": True, + }], + }, + "spec": { + "host": "hostname", + }, + }, + "https://hostname/", + None, + ), +]) +def test_get_kibana_url(route, expect_url, expect_error): + check = canned_kibana(lambda cmd, args, task_vars: json.dumps(route) if route else "") + + url, error = check._get_kibana_url({}) + if expect_url: + assert url == expect_url + else: + assert not url + if expect_error: + assert error == expect_error + else: + assert not error + + +@pytest.mark.parametrize('exec_result, expect', [ + ( + 'urlopen error [Errno 111] Connection refused', + 'at least one router routing to it?', + ), + ( + 'urlopen error [Errno -2] Name or service not known', + 'DNS configured for the Kibana hostname?', + ), + ( + 'Status code was not [302]: HTTP Error 500: Server error', + 'did not return the correct status code', + ), + ( + 'bork bork bork', + 'bork bork bork', # should pass through + ), +]) +def test_verify_url_internal_failure(exec_result, expect): + check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result)) + check._get_kibana_url = lambda task_vars: ('url', None) + + error = check._check_kibana_route({}) + assert_error(error, expect) + + +@pytest.mark.parametrize('lib_result, expect', [ + ( + HTTPError('url', 500, "it broke", hdrs=None, fp=None), + 'it broke', + ), + ( + URLError('it broke'), + 'it broke', + ), + ( + 302, + 'returned the wrong error code', + ), + ( + 200, + None, + ), +]) +def test_verify_url_external_failure(lib_result, expect, monkeypatch): + + class _http_return: + + def __init__(self, code): + self.code = code + + def getcode(self): + return self.code + + def urlopen(url, context): + if type(lib_result) is int: + return _http_return(lib_result) + raise lib_result + monkeypatch.setattr(urllib2, 'urlopen', urlopen) + + check = canned_kibana() + check._get_kibana_url = lambda task_vars: ('url', None) + check._verify_url_internal = lambda url, task_vars: None + + error = check._check_kibana_route({}) + assert_error(error, expect) diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py new file mode 100644 index 000000000..b6db34fe3 --- /dev/null +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -0,0 +1,137 @@ +import pytest +import json + +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException + +task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) + + +logging_namespace = "logging" + + +def canned_loggingcheck(exec_oc=None): + """Create a LoggingCheck object with canned exec_oc method""" + check = LoggingCheck("dummy") # fails if a module is actually invoked + check.logging_namespace = 'logging' + if exec_oc: + check.exec_oc = exec_oc + return check + + +def assert_error(error, expect_error): + if expect_error: + assert error + assert expect_error in error + else: + assert not error + + +plain_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es"}, + "name": "logging-es", + }, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + "containerStatuses": [{"ready": True}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "name logging-es", +} + +plain_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +fluentd_pod_node1 = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": {"host": "node1", "nodeName": "node1"}, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +plain_curator_pod = { + "metadata": { + "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, + "name": "logging-curator-1", + }, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + "podIP": "10.10.10.10", + } +} + + +@pytest.mark.parametrize('problem, expect', [ + ("[Errno 2] No such file or directory", "supposed to be a master"), + ("Permission denied", "Unexpected error using `oc`"), +]) +def test_oc_failure(problem, expect): + def execute_module(module_name, args, task_vars): + if module_name == "ocutil": + return dict(failed=True, result=problem) + return dict(changed=False) + + check = LoggingCheck({}) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.exec_oc(execute_module, logging_namespace, 'get foo', [], task_vars=task_vars_config_base) + assert expect in str(excinfo) + + +groups_with_first_master = dict(masters=['this-host', 'other-host']) +groups_with_second_master = dict(masters=['other-host', 'this-host']) +groups_not_a_master = dict(masters=['other-host']) + + +@pytest.mark.parametrize('groups, logging_deployed, is_active', [ + (groups_with_first_master, True, True), + (groups_with_first_master, False, False), + (groups_not_a_master, True, False), + (groups_with_second_master, True, False), + (groups_not_a_master, True, False), +]) +def test_is_active(groups, logging_deployed, is_active): + task_vars = dict( + ansible_ssh_host='this-host', + groups=groups, + openshift_hosted_logging_deploy=logging_deployed, + ) + + assert LoggingCheck.is_active(task_vars=task_vars) == is_active + + +@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [ + ( + 'No resources found.', + None, + 'There are no pods in the logging namespace', + ), + ( + json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}), + [plain_es_pod], + None, + ), +]) +def test_get_pods_for_component(pod_output, expect_pods, expect_error): + check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: pod_output) + pods, error = check.get_pods_for_component( + lambda name, args, task_vars: {}, + logging_namespace, + "es", + {} + ) + assert_error(error, expect_error) -- cgit v1.2.3