From c096aff90d117e485e5bea17c9386d16c571fb5d Mon Sep 17 00:00:00 2001
From: Russell Teague <rteague@redhat.com>
Date: Mon, 25 Sep 2017 14:54:26 -0400
Subject: OpenShift-Ansible Installer Checkpointing

- Added installer_checkpoint role and callback plugin
- Added checkpoint 'Start' and 'End' plays to each installation phase

Additional items related to enabling proper checkpointing:
- Removed openshift_set_hostname and related task (related to 3.0)
- Added openshift-hosted entry point playbook
- Moved openshift metrics and logging out of openshift_hosted playbook
- Moved API Aggregation play to master install
---
 roles/installer_checkpoint/README.md               | 177 +++++++++++++++++
 .../callback_plugins/installer_checkpoint.py       | 182 +++++++++++++++++
 roles/openshift_metrics/tasks/main.yaml            |   4 +-
 .../files/openshift-ansible-catalog-console.js     |   2 -
 .../tasks/wire_aggregator.yml                      | 215 ---------------------
 5 files changed, 361 insertions(+), 219 deletions(-)
 create mode 100644 roles/installer_checkpoint/README.md
 create mode 100644 roles/installer_checkpoint/callback_plugins/installer_checkpoint.py
 delete mode 100644 roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js
 delete mode 100644 roles/openshift_service_catalog/tasks/wire_aggregator.yml

(limited to 'roles')

diff --git a/roles/installer_checkpoint/README.md b/roles/installer_checkpoint/README.md
new file mode 100644
index 000000000..321acca21
--- /dev/null
+++ b/roles/installer_checkpoint/README.md
@@ -0,0 +1,177 @@
+OpenShift-Ansible Installer Checkpoint
+======================================
+
+A complete OpenShift cluster installation is comprised of many different
+components which can take 30 minutes to several hours to complete.  If the
+installation should fail, it could be confusing to understand at which component
+the failure occurred.  Additionally, it may be desired to re-run only the
+component which failed instead of starting over from the beginning.  Components
+which came after the failed component would also need to be run individually.
+
+Design
+------
+
+The Installer Checkpoint implements an Ansible callback plugin to allow
+displaying and logging of the installer status at the end of a playbook run.
+
+To ensure the callback plugin is loaded, regardless of ansible.cfg file
+configuration, the plugin has been placed inside the installer_checkpoint role
+which must be called early in playbook execution. The `std_include.yml` playbook
+is run first for all entry point playbooks; therefore, the initialization of the
+checkpoint plugin has been placed at the beginning of that file.
+
+Playbooks use the [set_stats][set_stats] Ansible module to set a custom stats
+variable indicating the status of the phase being executed.
+
+The installer_checkpoint.py callback plugin extends the Ansible
+`v2_playbook_on_stats` method, which is called at the end of a playbook run, to
+display the status of each phase which was run.  The INSTALLER STATUS report is
+displayed immediately following the PLAY RECAP.
+
+Phases of cluster installation are mapped to the steps in the
+[common/openshift-cluster/config.yml][openshift_cluster_config] playbook.
+
+To correctly display the order of the installer phases, the `installer_phases`
+variable defines the phase or component order.
+
+```python
+        # Set the order of the installer phases
+        installer_phases = [
+            'installer_phase_initialize',
+            'installer_phase_etcd',
+            'installer_phase_nfs',
+            'installer_phase_loadbalancer',
+            'installer_phase_master',
+            'installer_phase_master_additional',
+            'installer_phase_node',
+            'installer_phase_glusterfs',
+            'installer_phase_hosted',
+            'installer_phase_metrics',
+            'installer_phase_logging',
+            'installer_phase_servicecatalog',
+        ]
+```
+
+Additional attributes, such as display title and component playbook, of each
+phase are stored in the `phase_attributes` variable.
+
+```python
+        # Define the attributes of the installer phases
+        phase_attributes = {
+            'installer_phase_initialize': {
+                'title': 'Initialization',
+                'playbook': ''
+            },
+            'installer_phase_etcd': {
+                'title': 'etcd Install',
+                'playbook': 'playbooks/byo/openshift-etcd/config.yml'
+            },
+            'installer_phase_nfs': {
+                'title': 'NFS Install',
+                'playbook': 'playbooks/byo/openshift-nfs/config.yml'
+            },
+            #...
+        }
+```
+
+Usage
+-----
+
+In order to indicate the beginning of a component installation, a play must be
+added to the beginning of the main playbook for the component to set the phase
+status to "In Progress".  Additionally, a play must be added after the last play
+for that component to set the phase status to "Complete".
+
+The following example shows the first play of the 'installer phase' loading the
+`installer_checkpoint` role, as well as the `set_stats` task for setting
+`installer_phase_initialize` to "In Progress".  Various plays are run for the
+phase/component and then a final play for setting `installer_phase_initialize` to
+"Complete".
+
+```yaml
+# common/openshift-cluster/std_include.yml
+---
+- name: Initialization Checkpoint Start
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  roles:
+  - installer_checkpoint
+  tasks:
+  - name: Set install initialization 'In Progress'
+    set_stats:
+      data:
+        installer_phase_initialize: "In Progress"
+      aggregate: false
+
+#...
+# Various plays here
+#...
+
+- name: Initialization Checkpoint End
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+  - name: Set install initialization 'Complete'
+    set_stats:
+      data:
+        installer_phase_initialize: "Complete"
+      aggregate: false
+```
+
+Each phase or component of the installer will follow a similar pattern, with the
+exception that the `installer_checkpoint` role does not need to be called since
+it was already loaded by the play in `std_include.yml`.  It is important to
+place the 'In Progress' and 'Complete' plays as the first and last plays of the
+phase or component.
+
+Examples
+--------
+
+Example display of a successful playbook run:
+
+```
+PLAY RECAP *********************************************************************
+master01.example.com : ok=158  changed=16   unreachable=0    failed=0
+node01.example.com   : ok=469  changed=74   unreachable=0    failed=0
+node02.example.com   : ok=157  changed=17   unreachable=0    failed=0
+localhost            : ok=24   changed=0    unreachable=0    failed=0
+
+
+INSTALLER STATUS ***************************************************************
+Initialization             : Complete
+etcd Install               : Complete
+NFS Install                : Not Started
+Load balancer Install      : Not Started
+Master Install             : Complete
+Master Additional Install  : Complete
+Node Install               : Complete
+GlusterFS Install          : Not Started
+Hosted Install             : Complete
+Metrics Install            : Not Started
+Logging Install            : Not Started
+Service Catalog Install    : Not Started
+```
+
+Example display if a failure occurs during execution:
+
+```
+INSTALLER STATUS ***************************************************************
+Initialization             : Complete
+etcd Install               : Complete
+NFS Install                : Not Started
+Load balancer Install      : Not Started
+Master Install             : In Progress
+     This phase can be restarted by running: playbooks/byo/openshift-master/config.yml
+Master Additional Install  : Not Started
+Node Install               : Not Started
+GlusterFS Install          : Not Started
+Hosted Install             : Not Started
+Metrics Install            : Not Started
+Logging Install            : Not Started
+Service Catalog Install    : Not Started
+```
+
+[set_stats]: http://docs.ansible.com/ansible/latest/set_stats_module.html
+[openshift_cluster_config]: https://github.com/openshift/openshift-ansible/blob/master/playbooks/common/openshift-cluster/config.yml
diff --git a/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py b/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py
new file mode 100644
index 000000000..033240e62
--- /dev/null
+++ b/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py
@@ -0,0 +1,182 @@
+"""Ansible callback plugin to print a summary completion status of installation
+phases.
+"""
+from ansible.plugins.callback import CallbackBase
+from ansible import constants as C
+
+DOCUMENTATION = '''
+
+'''
+
+EXAMPLES = '''
+---------------------------------------------
+Example display of a successful playbook run:
+
+PLAY RECAP *********************************************************************
+master01.example.com : ok=158  changed=16   unreachable=0    failed=0
+node01.example.com   : ok=469  changed=74   unreachable=0    failed=0
+node02.example.com   : ok=157  changed=17   unreachable=0    failed=0
+localhost            : ok=24   changed=0    unreachable=0    failed=0
+
+
+INSTALLER STATUS ***************************************************************
+Initialization             : Complete
+etcd Install               : Complete
+NFS Install                : Not Started
+Load balancer Install      : Not Started
+Master Install             : Complete
+Master Additional Install  : Complete
+Node Install               : Complete
+GlusterFS Install          : Not Started
+Hosted Install             : Complete
+Metrics Install            : Not Started
+Logging Install            : Not Started
+Service Catalog Install    : Not Started
+
+-----------------------------------------------------
+Example display if a failure occurs during execution:
+
+INSTALLER STATUS ***************************************************************
+Initialization             : Complete
+etcd Install               : Complete
+NFS Install                : Not Started
+Load balancer Install      : Not Started
+Master Install             : In Progress
+     This phase can be restarted by running: playbooks/byo/openshift-master/config.yml
+Master Additional Install  : Not Started
+Node Install               : Not Started
+GlusterFS Install          : Not Started
+Hosted Install             : Not Started
+Metrics Install            : Not Started
+Logging Install            : Not Started
+Service Catalog Install    : Not Started
+
+'''
+
+
+class CallbackModule(CallbackBase):
+    """This callback summarizes installation phase status."""
+
+    CALLBACK_VERSION = 2.0
+    CALLBACK_TYPE = 'aggregate'
+    CALLBACK_NAME = 'installer_checkpoint'
+    CALLBACK_NEEDS_WHITELIST = False
+
+    def __init__(self):
+        super(CallbackModule, self).__init__()
+
+    def v2_playbook_on_stats(self, stats):
+
+        # Set the order of the installer phases
+        installer_phases = [
+            'installer_phase_initialize',
+            'installer_phase_etcd',
+            'installer_phase_nfs',
+            'installer_phase_loadbalancer',
+            'installer_phase_master',
+            'installer_phase_master_additional',
+            'installer_phase_node',
+            'installer_phase_glusterfs',
+            'installer_phase_hosted',
+            'installer_phase_metrics',
+            'installer_phase_logging',
+            'installer_phase_servicecatalog',
+        ]
+
+        # Define the attributes of the installer phases
+        phase_attributes = {
+            'installer_phase_initialize': {
+                'title': 'Initialization',
+                'playbook': ''
+            },
+            'installer_phase_etcd': {
+                'title': 'etcd Install',
+                'playbook': 'playbooks/byo/openshift-etcd/config.yml'
+            },
+            'installer_phase_nfs': {
+                'title': 'NFS Install',
+                'playbook': 'playbooks/byo/openshift-nfs/config.yml'
+            },
+            'installer_phase_loadbalancer': {
+                'title': 'Load balancer Install',
+                'playbook': 'playbooks/byo/openshift-loadbalancer/config.yml'
+            },
+            'installer_phase_master': {
+                'title': 'Master Install',
+                'playbook': 'playbooks/byo/openshift-master/config.yml'
+            },
+            'installer_phase_master_additional': {
+                'title': 'Master Additional Install',
+                'playbook': 'playbooks/byo/openshift-master/additional_config.yml'
+            },
+            'installer_phase_node': {
+                'title': 'Node Install',
+                'playbook': 'playbooks/byo/openshift-node/config.yml'
+            },
+            'installer_phase_glusterfs': {
+                'title': 'GlusterFS Install',
+                'playbook': 'playbooks/byo/openshift-glusterfs/config.yml'
+            },
+            'installer_phase_hosted': {
+                'title': 'Hosted Install',
+                'playbook': 'playbooks/byo/openshift-cluster/openshift-hosted.yml'
+            },
+            'installer_phase_metrics': {
+                'title': 'Metrics Install',
+                'playbook': 'playbooks/byo/openshift-cluster/openshift-metrics.yml'
+            },
+            'installer_phase_logging': {
+                'title': 'Logging Install',
+                'playbook': 'playbooks/byo/openshift-cluster/openshift-logging.yml'
+            },
+            'installer_phase_servicecatalog': {
+                'title': 'Service Catalog Install',
+                'playbook': 'playbooks/byo/openshift-cluster/service-catalog.yml'
+            },
+        }
+
+        # Find the longest phase title
+        max_column = 0
+        for phase in phase_attributes:
+            max_column = max(max_column, len(phase_attributes[phase]['title']))
+
+        if '_run' in stats.custom:
+            self._display.banner('INSTALLER STATUS')
+            for phase in installer_phases:
+                phase_title = phase_attributes[phase]['title']
+                padding = max_column - len(phase_title) + 2
+                if phase in stats.custom['_run']:
+                    phase_status = stats.custom['_run'][phase]
+                    self._display.display(
+                        '{}{}: {}'.format(phase_title, ' ' * padding, phase_status),
+                        color=self.phase_color(phase_status))
+                    if phase_status == 'In Progress' and phase != 'installer_phase_initialize':
+                        self._display.display(
+                            '\tThis phase can be restarted by running: {}'.format(
+                                phase_attributes[phase]['playbook']))
+                else:
+                    # Phase was not found in custom stats
+                    self._display.display(
+                        '{}{}: {}'.format(phase_title, ' ' * padding, 'Not Started'),
+                        color=C.COLOR_SKIP)
+
+        self._display.display("", screen_only=True)
+
+    def phase_color(self, status):
+        """ Return color code for installer phase"""
+        valid_status = [
+            'In Progress',
+            'Complete',
+        ]
+
+        if status not in valid_status:
+            self._display.warning('Invalid phase status defined: {}'.format(status))
+
+        if status == 'Complete':
+            phase_color = C.COLOR_OK
+        elif status == 'In Progress':
+            phase_color = C.COLOR_ERROR
+        else:
+            phase_color = C.COLOR_WARN
+
+        return phase_color
diff --git a/roles/openshift_metrics/tasks/main.yaml b/roles/openshift_metrics/tasks/main.yaml
index 0461039fc..c92458c50 100644
--- a/roles/openshift_metrics/tasks/main.yaml
+++ b/roles/openshift_metrics/tasks/main.yaml
@@ -45,11 +45,11 @@
 
 - include: install_metrics.yaml
   when:
-    - openshift_metrics_install_metrics | default(false) | bool
+    - openshift_metrics_install_metrics | bool
 
 - include: uninstall_metrics.yaml
   when:
-    - openshift_metrics_uninstall_metrics | default(false) | bool
+    - openshift_metrics_uninstall_metrics | bool
 
 - include: uninstall_hosa.yaml
   when: not openshift_metrics_install_hawkular_agent | bool
diff --git a/roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js b/roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js
deleted file mode 100644
index d0a9f11dc..000000000
--- a/roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js
+++ /dev/null
@@ -1,2 +0,0 @@
-// empty file so that the master-config can still point to a file that exists
-// this file will be replaced by the template service broker role if enabled
diff --git a/roles/openshift_service_catalog/tasks/wire_aggregator.yml b/roles/openshift_service_catalog/tasks/wire_aggregator.yml
deleted file mode 100644
index 300a7db62..000000000
--- a/roles/openshift_service_catalog/tasks/wire_aggregator.yml
+++ /dev/null
@@ -1,215 +0,0 @@
----
-- name: Make temp cert dir
-  command: mktemp -d /tmp/openshift-service-catalog-ansible-XXXXXX
-  register: certtemp
-  changed_when: False
-
-- name: Check for First Master Aggregator Signer cert
-  stat:
-    path: /etc/origin/master/front-proxy-ca.crt
-  register: first_proxy_ca_crt
-  changed_when: false
-  delegate_to: "{{ first_master }}"
-
-- name: Check for First Master Aggregator Signer key
-  stat:
-    path: /etc/origin/master/front-proxy-ca.crt
-  register: first_proxy_ca_key
-  changed_when: false
-  delegate_to: "{{ first_master }}"
-
-# TODO: this currently has a bug where hostnames are required
-- name: Creating First Master Aggregator signer certs
-  command: >
-    {{ hostvars[first_master].openshift.common.client_binary }} adm ca create-signer-cert
-    --cert=/etc/origin/master/front-proxy-ca.crt
-    --key=/etc/origin/master/front-proxy-ca.key
-    --serial=/etc/origin/master/ca.serial.txt
-  delegate_to: "{{ first_master }}"
-  when:
-  - not first_proxy_ca_crt.stat.exists
-  - not first_proxy_ca_key.stat.exists
-
-- name: Check for Aggregator Signer cert
-  stat:
-    path: /etc/origin/master/front-proxy-ca.crt
-  register: proxy_ca_crt
-  changed_when: false
-
-- name: Check for Aggregator Signer key
-  stat:
-    path: /etc/origin/master/front-proxy-ca.crt
-  register: proxy_ca_key
-  changed_when: false
-
-- name: Copy Aggregator Signer certs from first master
-  fetch:
-    src: "/etc/origin/master/{{ item }}"
-    dest: "{{ certtemp.stdout }}/{{ item }}"
-    flat: yes
-  with_items:
-  - front-proxy-ca.crt
-  - front-proxy-ca.key
-  delegate_to: "{{ first_master }}"
-  when:
-  - not proxy_ca_key.stat.exists
-  - not proxy_ca_crt.stat.exists
-
-- name: Copy Aggregator Signer certs to host
-  copy:
-    src: "{{ certtemp.stdout }}/{{ item }}"
-    dest: "/etc/origin/master/{{ item }}"
-  with_items:
-  - front-proxy-ca.crt
-  - front-proxy-ca.key
-  when:
-  - not proxy_ca_key.stat.exists
-  - not proxy_ca_crt.stat.exists
-
-#  oc_adm_ca_server_cert:
-#    cert: /etc/origin/master/front-proxy-ca.crt
-#    key: /etc/origin/master/front-proxy-ca.key
-
-- name: Check for first master api-client config
-  stat:
-    path: /etc/origin/master/aggregator-front-proxy.kubeconfig
-  register: first_front_proxy_kubeconfig
-  delegate_to: "{{ first_master }}"
-  run_once: true
-
-# create-api-client-config generates a ca.crt file which will
-# overwrite the OpenShift CA certificate.  Generate the aggregator
-# kubeconfig in a temporary directory and then copy files into the
-# master config dir to avoid overwriting ca.crt.
-- block:
-  - name: Create first master api-client config for Aggregator
-    command: >
-      {{ hostvars[first_master].openshift.common.client_binary }} adm create-api-client-config
-      --certificate-authority=/etc/origin/master/front-proxy-ca.crt
-      --signer-cert=/etc/origin/master/front-proxy-ca.crt
-      --signer-key=/etc/origin/master/front-proxy-ca.key
-      --user aggregator-front-proxy
-      --client-dir={{ certtemp.stdout }}
-      --signer-serial=/etc/origin/master/ca.serial.txt
-    delegate_to: "{{ first_master }}"
-    run_once: true
-  - name: Copy first master api-client config for Aggregator
-    copy:
-      src: "{{ certtemp.stdout }}/{{ item }}"
-      dest: "/etc/origin/master/"
-      remote_src: true
-    with_items:
-    - aggregator-front-proxy.crt
-    - aggregator-front-proxy.key
-    - aggregator-front-proxy.kubeconfig
-    delegate_to: "{{ first_master }}"
-    run_once: true
-  when:
-  - not first_front_proxy_kubeconfig.stat.exists
-
-- name: Check for api-client config
-  stat:
-    path: /etc/origin/master/aggregator-front-proxy.kubeconfig
-  register: front_proxy_kubeconfig
-
-- name: Copy api-client config from first master
-  fetch:
-    src: "/etc/origin/master/{{ item }}"
-    dest: "{{ certtemp.stdout }}/{{ item }}"
-    flat: yes
-  delegate_to: "{{ first_master }}"
-  with_items:
-  - aggregator-front-proxy.crt
-  - aggregator-front-proxy.key
-  - aggregator-front-proxy.kubeconfig
-  when:
-  - not front_proxy_kubeconfig.stat.exists
-
-- name: Copy api-client config to host
-  copy:
-    src: "{{ certtemp.stdout }}/{{ item }}"
-    dest: "/etc/origin/master/{{ item }}"
-  with_items:
-  - aggregator-front-proxy.crt
-  - aggregator-front-proxy.key
-  - aggregator-front-proxy.kubeconfig
-  when:
-  - not front_proxy_kubeconfig.stat.exists
-
-- name: copy tech preview extension file for service console UI
-  copy:
-    src: openshift-ansible-catalog-console.js
-    dest: /etc/origin/master/openshift-ansible-catalog-console.js
-
-- name: Update master config
-  yedit:
-    state: present
-    src: /etc/origin/master/master-config.yaml
-    edits:
-    - key: aggregatorConfig.proxyClientInfo.certFile
-      value: aggregator-front-proxy.crt
-    - key: aggregatorConfig.proxyClientInfo.keyFile
-      value: aggregator-front-proxy.key
-    - key: authConfig.requestHeader.clientCA
-      value: front-proxy-ca.crt
-    - key: authConfig.requestHeader.clientCommonNames
-      value: [aggregator-front-proxy]
-    - key: authConfig.requestHeader.usernameHeaders
-      value: [X-Remote-User]
-    - key: authConfig.requestHeader.groupHeaders
-      value: [X-Remote-Group]
-    - key: authConfig.requestHeader.extraHeaderPrefixes
-      value: [X-Remote-Extra-]
-    - key: assetConfig.extensionScripts
-      value: [/etc/origin/master/openshift-ansible-catalog-console.js]
-    - key: kubernetesMasterConfig.apiServerArguments.runtime-config
-      value: [apis/settings.k8s.io/v1alpha1=true]
-    - key: admissionConfig.pluginConfig.PodPreset.configuration.kind
-      value: DefaultAdmissionConfig
-    - key: admissionConfig.pluginConfig.PodPreset.configuration.apiVersion
-      value: v1
-    - key: admissionConfig.pluginConfig.PodPreset.configuration.disable
-      value: false
-  register: yedit_output
-
-#restart master serially here
-- name: restart master api
-  systemd: name={{ openshift.common.service_type }}-master-api state=restarted
-  when:
-  - yedit_output.changed
-  - openshift.master.cluster_method == 'native'
-
-- name: restart master controllers
-  systemd: name={{ openshift.common.service_type }}-master-controllers state=restarted
-  when:
-  - yedit_output.changed
-  - openshift.master.cluster_method == 'native'
-
-- name: Verify API Server
-  # Using curl here since the uri module requires python-httplib2 and
-  # wait_for port doesn't provide health information.
-  command: >
-    curl --silent --tlsv1.2
-    {% if openshift.common.version_gte_3_2_or_1_2 | bool %}
-    --cacert {{ openshift.common.config_base }}/master/ca-bundle.crt
-    {% else %}
-    --cacert {{ openshift.common.config_base }}/master/ca.crt
-    {% endif %}
-    {{ openshift.master.api_url }}/healthz/ready
-  args:
-    # Disables the following warning:
-    # Consider using get_url or uri module rather than running curl
-    warn: no
-  register: api_available_output
-  until: api_available_output.stdout == 'ok'
-  retries: 120
-  delay: 1
-  changed_when: false
-  when:
-  - yedit_output.changed
-
-- name: Delete temp directory
-  file:
-    name: "{{ certtemp.stdout }}"
-    state: absent
-  changed_when: False
-- 
cgit v1.2.3