diff options
Diffstat (limited to 'roles')
| -rw-r--r-- | roles/openshift_health_checker/openshift_checks/docker_storage.py | 177 | ||||
| -rw-r--r-- | roles/openshift_health_checker/test/docker_storage_test.py | 107 | 
2 files changed, 242 insertions, 42 deletions
| diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py index e80691ef3..d2227d244 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_storage.py +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -1,5 +1,6 @@  """Check Docker storage driver and usage."""  import json +import os.path  import re  from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var  from openshift_checks.mixins import DockerHostMixin @@ -20,10 +21,27 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):      storage_drivers = ["devicemapper", "overlay", "overlay2"]      max_thinpool_data_usage_percent = 90.0      max_thinpool_meta_usage_percent = 90.0 +    max_overlay_usage_percent = 90.0 + +    # TODO(lmeyer): mention these in the output when check fails +    configuration_variables = [ +        ( +            "max_thinpool_data_usage_percent", +            "For 'devicemapper' storage driver, usage threshold percentage for data. " +            "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent), +        ), +        ( +            "max_thinpool_meta_usage_percent", +            "For 'devicemapper' storage driver, usage threshold percentage for metadata. " +            "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent), +        ), +        ( +            "max_overlay_usage_percent", +            "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. " +            "Format: float. Default: {:.1f}".format(max_overlay_usage_percent), +        ), +    ] -    # pylint: disable=too-many-return-statements -    # Reason: permanent stylistic exception; -    #         it is clearer to return on failures and there are just many ways to fail here.      def run(self, tmp, task_vars):          msg, failed, changed = self.ensure_dependencies(task_vars)          if failed: @@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):              }          # attempt to get the docker info hash from the API -        info = self.execute_module("docker_info", {}, task_vars=task_vars) -        if info.get("failed"): +        docker_info = self.execute_module("docker_info", {}, task_vars=task_vars) +        if docker_info.get("failed"):              return {"failed": True, "changed": changed,                      "msg": "Failed to query Docker API. Is docker running on this host?"} -        if not info.get("info"):  # this would be very strange +        if not docker_info.get("info"):  # this would be very strange              return {"failed": True, "changed": changed, -                    "msg": "Docker API query missing info:\n{}".format(json.dumps(info))} -        info = info["info"] +                    "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))} +        docker_info = docker_info["info"]          # check if the storage driver we saw is valid -        driver = info.get("Driver", "[NONE]") +        driver = docker_info.get("Driver", "[NONE]")          if driver not in self.storage_drivers:              msg = (                  "Detected unsupported Docker storage driver '{driver}'.\n" @@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):              return {"failed": True, "changed": changed, "msg": msg}          # driver status info is a list of tuples; convert to dict and validate based on driver -        driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])} +        driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} + +        result = {} +          if driver == "devicemapper": -            if driver_status.get("Data loop file"): -                msg = ( -                    "Use of loopback devices with the Docker devicemapper storage driver\n" -                    "(the default storage configuration) is unsupported in production.\n" -                    "Please use docker-storage-setup to configure a backing storage volume.\n" -                    "See http://red.ht/2rNperO for further information." -                ) -                return {"failed": True, "changed": changed, "msg": msg} -            result = self._check_dm_usage(driver_status, task_vars) -            result['changed'] = result.get('changed', False) or changed -            return result +            result = self.check_devicemapper_support(driver_status, task_vars) -        # TODO(lmeyer): determine how to check usage for overlay2 +        if driver in ['overlay', 'overlay2']: +            result = self.check_overlay_support(docker_info, driver_status, task_vars) -        return {"changed": changed} +        result['changed'] = result.get('changed', False) or changed +        return result -    def _check_dm_usage(self, driver_status, task_vars): -        """ +    def check_devicemapper_support(self, driver_status, task_vars): +        """Check if dm storage driver is supported as configured. Return: result dict.""" +        if driver_status.get("Data loop file"): +            msg = ( +                "Use of loopback devices with the Docker devicemapper storage driver\n" +                "(the default storage configuration) is unsupported in production.\n" +                "Please use docker-storage-setup to configure a backing storage volume.\n" +                "See http://red.ht/2rNperO for further information." +            ) +            return {"failed": True, "msg": msg} +        result = self.check_dm_usage(driver_status, task_vars) +        return result + +    def check_dm_usage(self, driver_status, task_vars): +        """Check usage thresholds for Docker dm storage driver. Return: result dict.          Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool          implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures          devicemapper storage. The LV is "thin" because it does not use all available storage @@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):          could run out of space first; so we check both.          """          vals = dict( -            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars), +            vg_free=self.get_vg_free(driver_status.get("Pool Name"), task_vars),              data_used=driver_status.get("Data Space Used"),              data_total=driver_status.get("Data Space Total"),              metadata_used=driver_status.get("Metadata Space Used"), @@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):          # convert all human-readable strings to bytes          for key, value in vals.copy().items():              try: -                vals[key + "_bytes"] = self._convert_to_bytes(value) +                vals[key + "_bytes"] = self.convert_to_bytes(value)              except ValueError as err:  # unlikely to hit this from API info, but just to be safe                  return {                      "failed": True, @@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):          vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])          return vals -    def _get_vg_free(self, pool, task_vars): -        # Determine which VG to examine according to the pool name, the only indicator currently -        # available from the Docker API driver info. We assume a name that looks like -        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen. +    def get_vg_free(self, pool, task_vars): +        """Determine which VG to examine according to the pool name. Return: size vgs reports. +        Pool name is the only indicator currently available from the Docker API driver info. +        We assume a name that looks like "vg--name-docker--pool"; +        vg and lv names with inner hyphens doubled, joined by a hyphen. +        """          match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen          if not match:  # unlikely, but... be clear if we assumed wrong              raise OpenShiftCheckException( @@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):          return size      @staticmethod -    def _convert_to_bytes(string): +    def convert_to_bytes(string): +        """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""          units = dict(              b=1,              k=1024, @@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):              raise ValueError("Cannot convert to a byte size: " + string)          return float(number) * multiplier + +    def check_overlay_support(self, docker_info, driver_status, task_vars): +        """Check if overlay storage driver is supported for this host. Return: result dict.""" +        # check for xfs as backing store +        backing_fs = driver_status.get("Backing Filesystem", "[NONE]") +        if backing_fs != "xfs": +            msg = ( +                "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n" +                "'xfs' as the backing storage, but this host's storage is type '{fs}'." +            ).format(fs=backing_fs) +            return {"failed": True, "msg": msg} + +        # check support for OS and kernel version +        o_s = docker_info.get("OperatingSystem", "[NONE]") +        if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s: +            # keep it simple, only check enterprise kernel versions; assume everyone else is good +            kernel = docker_info.get("KernelVersion", "[NONE]") +            kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)] +            if kernel_arr < [3, 10, 0, 514]:  # rhel < 7.3 +                msg = ( +                    "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n" +                    "kernel version 3.10.0-514; but Docker reports kernel version {version}." +                ).format(version=kernel) +                return {"failed": True, "msg": msg} +            # NOTE: we could check for --selinux-enabled here but docker won't even start with +            # that option until it's supported in the kernel so we don't need to. + +        return self.check_overlay_usage(docker_info, task_vars) + +    def check_overlay_usage(self, docker_info, task_vars): +        """Check disk usage on OverlayFS backing store volume. Return: result dict.""" +        path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"] + +        threshold = get_var(task_vars, "max_overlay_usage_percent", default=self.max_overlay_usage_percent) +        try: +            threshold = float(threshold) +        except ValueError: +            return { +                "failed": True, +                "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold), +            } + +        mount = self.find_ansible_mount(path, get_var(task_vars, "ansible_mounts")) +        try: +            free_bytes = mount['size_available'] +            total_bytes = mount['size_total'] +            usage = 100.0 * (total_bytes - free_bytes) / total_bytes +        except (KeyError, ZeroDivisionError): +            return { +                "failed": True, +                "msg": "The ansible_mount found for path {} is invalid.\n" +                       "This is likely to be an Ansible bug. The record was:\n" +                       "{}".format(path, json.dumps(mount, indent=2)), +            } + +        if usage > threshold: +            return { +                "failed": True, +                "msg": ( +                    "For Docker OverlayFS mount point {path},\n" +                    "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}." +                ).format(path=mount["mount"], pct=usage, thresh=threshold) +            } + +        return {} + +    # TODO(lmeyer): migrate to base class +    @staticmethod +    def find_ansible_mount(path, ansible_mounts): +        """Return the mount point for path from ansible_mounts.""" + +        mount_for_path = {mount['mount']: mount for mount in ansible_mounts} +        mount_point = path +        while mount_point not in mount_for_path: +            if mount_point in ["/", ""]:  # "/" not in ansible_mounts??? +                break +            mount_point = os.path.dirname(mount_point) + +        try: +            return mount_for_path[mount_point] +        except KeyError: +            known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none' +            msg = 'Unable to determine mount point for path "{}". Known mount points: {}.' +            raise OpenShiftCheckException(msg.format(path, known_mounts)) diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py index bb25e3f66..99c529054 100644 --- a/roles/openshift_health_checker/test/docker_storage_test.py +++ b/roles/openshift_health_checker/test/docker_storage_test.py @@ -23,7 +23,8 @@ def test_is_active(is_containerized, group_names, is_active):      assert DockerStorage.is_active(task_vars=task_vars) == is_active -non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}} +def non_atomic_task_vars(): +    return {"openshift": {"common": {"is_atomic": False}}}  @pytest.mark.parametrize('docker_info, failed, expect_msg', [ @@ -56,7 +57,7 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}      (          dict(info={              "Driver": "overlay2", -            "DriverStatus": [] +            "DriverStatus": [("Backing Filesystem", "xfs")],          }),          False,          [], @@ -64,6 +65,27 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}      (          dict(info={              "Driver": "overlay", +            "DriverStatus": [("Backing Filesystem", "btrfs")], +        }), +        True, +        ["storage is type 'btrfs'", "only supported with\n'xfs'"], +    ), +    ( +        dict(info={ +            "Driver": "overlay2", +            "DriverStatus": [("Backing Filesystem", "xfs")], +            "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)", +            "KernelVersion": "3.10.0-327.22.2.el7.x86_64", +        }), +        True, +        ["Docker reports kernel version 3.10.0-327"], +    ), +    ( +        dict(info={ +            "Driver": "overlay", +            "DriverStatus": [("Backing Filesystem", "xfs")], +            "OperatingSystem": "CentOS", +            "KernelVersion": "3.10.0-514",          }),          False,          [], @@ -85,8 +107,9 @@ def test_check_storage_driver(docker_info, failed, expect_msg):          return docker_info      check = dummy_check(execute_module=execute_module) -    check._check_dm_usage = lambda status, task_vars: dict()  # stub out for this test -    result = check.run(tmp=None, task_vars=non_atomic_task_vars) +    check.check_dm_usage = lambda status, task_vars: dict()  # stub out for this test +    check.check_overlay_usage = lambda info, task_vars: dict()  # stub out for this test +    result = check.run(tmp=None, task_vars=non_atomic_task_vars())      if failed:          assert result["failed"] @@ -146,8 +169,8 @@ not_enough_space = {  ])  def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):      check = dummy_check() -    check._get_vg_free = lambda pool, task_vars: vg_free -    result = check._check_dm_usage(driver_status, task_vars) +    check.get_vg_free = lambda pool, task_vars: vg_free +    result = check.check_dm_usage(driver_status, task_vars)      result_success = not result.get("failed")      assert result_success is success @@ -195,10 +218,10 @@ def test_vg_free(pool, command_returns, raises, returns):      check = dummy_check(execute_module=execute_module)      if raises:          with pytest.raises(OpenShiftCheckException) as err: -            check._get_vg_free(pool, {}) +            check.get_vg_free(pool, {})          assert raises in str(err.value)      else: -        ret = check._get_vg_free(pool, {}) +        ret = check.get_vg_free(pool, {})          assert ret == returns @@ -209,7 +232,7 @@ def test_vg_free(pool, command_returns, raises, returns):      ("12g", 12.0 * 1024**3),  ])  def test_convert_to_bytes(string, expect_bytes): -    got = DockerStorage._convert_to_bytes(string) +    got = DockerStorage.convert_to_bytes(string)      assert got == expect_bytes @@ -219,6 +242,70 @@ def test_convert_to_bytes(string, expect_bytes):  ])  def test_convert_to_bytes_error(string):      with pytest.raises(ValueError) as err: -        DockerStorage._convert_to_bytes(string) +        DockerStorage.convert_to_bytes(string)      assert "Cannot convert" in str(err.value)      assert string in str(err.value) + + +ansible_mounts_enough = [{ +    'mount': '/var/lib/docker', +    'size_available': 50 * 10**9, +    'size_total': 50 * 10**9, +}] +ansible_mounts_not_enough = [{ +    'mount': '/var/lib/docker', +    'size_available': 0, +    'size_total': 50 * 10**9, +}] +ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')] +ansible_mounts_zero_size = [{ +    'mount': '/var/lib/docker', +    'size_available': 0, +    'size_total': 0, +}] + + +@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [ +    ( +        ansible_mounts_enough, +        None, +        False, +        [], +    ), +    ( +        ansible_mounts_not_enough, +        None, +        True, +        ["usage percentage", "higher than threshold"], +    ), +    ( +        ansible_mounts_not_enough, +        "bogus percent", +        True, +        ["is not a percentage"], +    ), +    ( +        ansible_mounts_missing_fields, +        None, +        True, +        ["Ansible bug"], +    ), +    ( +        ansible_mounts_zero_size, +        None, +        True, +        ["Ansible bug"], +    ), +]) +def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg): +    check = dummy_check() +    task_vars = non_atomic_task_vars() +    task_vars["ansible_mounts"] = ansible_mounts +    if threshold is not None: +        task_vars["max_overlay_usage_percent"] = threshold +    docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay") +    result = check.check_overlay_usage(docker_info, task_vars) + +    assert expect_fail == bool(result.get("failed")) +    for msg in expect_msg: +        assert msg in result["msg"] | 
