From c096aff90d117e485e5bea17c9386d16c571fb5d Mon Sep 17 00:00:00 2001 From: Russell Teague Date: Mon, 25 Sep 2017 14:54:26 -0400 Subject: OpenShift-Ansible Installer Checkpointing - Added installer_checkpoint role and callback plugin - Added checkpoint 'Start' and 'End' plays to each installation phase Additional items related to enabling proper checkpointing: - Removed openshift_set_hostname and related task (related to 3.0) - Added openshift-hosted entry point playbook - Moved openshift metrics and logging out of openshift_hosted playbook - Moved API Aggregation play to master install --- roles/installer_checkpoint/README.md | 177 +++++++++++++++++ .../callback_plugins/installer_checkpoint.py | 182 +++++++++++++++++ roles/openshift_metrics/tasks/main.yaml | 4 +- .../files/openshift-ansible-catalog-console.js | 2 - .../tasks/wire_aggregator.yml | 215 --------------------- 5 files changed, 361 insertions(+), 219 deletions(-) create mode 100644 roles/installer_checkpoint/README.md create mode 100644 roles/installer_checkpoint/callback_plugins/installer_checkpoint.py delete mode 100644 roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js delete mode 100644 roles/openshift_service_catalog/tasks/wire_aggregator.yml (limited to 'roles') diff --git a/roles/installer_checkpoint/README.md b/roles/installer_checkpoint/README.md new file mode 100644 index 000000000..321acca21 --- /dev/null +++ b/roles/installer_checkpoint/README.md @@ -0,0 +1,177 @@ +OpenShift-Ansible Installer Checkpoint +====================================== + +A complete OpenShift cluster installation is comprised of many different +components which can take 30 minutes to several hours to complete. If the +installation should fail, it could be confusing to understand at which component +the failure occurred. Additionally, it may be desired to re-run only the +component which failed instead of starting over from the beginning. 
Components +which came after the failed component would also need to be run individually. + +Design +------ + +The Installer Checkpoint implements an Ansible callback plugin to allow +displaying and logging of the installer status at the end of a playbook run. + +To ensure the callback plugin is loaded, regardless of ansible.cfg file +configuration, the plugin has been placed inside the installer_checkpoint role +which must be called early in playbook execution. The `std_include.yml` playbook +is run first for all entry point playbooks, therefore, the initialization of the +checkpoint plugin has been placed at the beginning of that file. + +Playbooks use the [set_stats][set_stats] Ansible module to set a custom stats +variable indicating the status of the phase being executed. + +The installer_checkpoint.py callback plugin extends the Ansible +`v2_playbook_on_stats` method, which is called at the end of a playbook run, to +display the status of each phase which was run. The INSTALLER STATUS report is +displayed immediately following the PLAY RECAP. + +Phases of cluster installation are mapped to the steps in the +[common/openshift-cluster/config.yml][openshift_cluster_config] playbook. + +To correctly display the order of the installer phases, the `installer_phases` +variable defines the phase or component order. + +```python + # Set the order of the installer phases + installer_phases = [ + 'installer_phase_initialize', + 'installer_phase_etcd', + 'installer_phase_nfs', + 'installer_phase_loadbalancer', + 'installer_phase_master', + 'installer_phase_master_additional', + 'installer_phase_node', + 'installer_phase_glusterfs', + 'installer_phase_hosted', + 'installer_phase_metrics', + 'installer_phase_logging', + 'installer_phase_servicecatalog', + ] +``` + +Additional attributes, such as display title and component playbook, of each +phase are stored in the `phase_attributes` variable. 
+ +```python + # Define the attributes of the installer phases + phase_attributes = { + 'installer_phase_initialize': { + 'title': 'Initialization', + 'playbook': '' + }, + 'installer_phase_etcd': { + 'title': 'etcd Install', + 'playbook': 'playbooks/byo/openshift-etcd/config.yml' + }, + 'installer_phase_nfs': { + 'title': 'NFS Install', + 'playbook': 'playbooks/byo/openshift-nfs/config.yml' + }, + #... + } +``` + +Usage +----- + +In order to indicate the beginning of a component installation, a play must be +added to the beginning of the main playbook for the component to set the phase +status to "In Progress". Additionally, a play must be added after the last play +for that component to set the phase status to "Complete". + +The following example shows the first play of the 'installer phase' loading the +`installer_checkpoint` role, as well as the `set_stats` task for setting +`installer_phase_initialize` to "In Progress". Various plays are run for the +phase/component and then a final play for setting `installer_phase_initialize` to +"Complete". + +```yaml +# common/openshift-cluster/std_include.yml +--- +- name: Initialization Checkpoint Start + hosts: localhost + connection: local + gather_facts: false + roles: + - installer_checkpoint + tasks: + - name: Set install initialization 'In Progress' + set_stats: + data: + installer_phase_initialize: "In Progress" + aggregate: false + +#... +# Various plays here +#... + +- name: Initialization Checkpoint End + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Set install initialization 'Complete' + set_stats: + data: + installer_phase_initialize: "Complete" + aggregate: false +``` + +Each phase or component of the installer will follow a similar pattern, with the +exception that the `installer_checkpoint` role does not need to be called since +it was already loaded by the play in `std_include.yml`. 
It is important to +place the 'In Progress' and 'Complete' plays as the first and last plays of the +phase or component. + +Examples +-------- + +Example display of a successful playbook run: + +``` +PLAY RECAP ********************************************************************* +master01.example.com : ok=158 changed=16 unreachable=0 failed=0 +node01.example.com : ok=469 changed=74 unreachable=0 failed=0 +node02.example.com : ok=157 changed=17 unreachable=0 failed=0 +localhost : ok=24 changed=0 unreachable=0 failed=0 + + +INSTALLER STATUS *************************************************************** +Initialization : Complete +etcd Install : Complete +NFS Install : Not Started +Load balancer Install : Not Started +Master Install : Complete +Master Additional Install : Complete +Node Install : Complete +GlusterFS Install : Not Started +Hosted Install : Complete +Metrics Install : Not Started +Logging Install : Not Started +Service Catalog Install : Not Started +``` + +Example display if a failure occurs during execution: + +``` +INSTALLER STATUS *************************************************************** +Initialization : Complete +etcd Install : Complete +NFS Install : Not Started +Load balancer Install : Not Started +Master Install : In Progress + This phase can be restarted by running: playbooks/byo/openshift-master/config.yml +Master Additional Install : Not Started +Node Install : Not Started +GlusterFS Install : Not Started +Hosted Install : Not Started +Metrics Install : Not Started +Logging Install : Not Started +Service Catalog Install : Not Started +``` + +[set_stats]: http://docs.ansible.com/ansible/latest/set_stats_module.html +[openshift_cluster_config]: https://github.com/openshift/openshift-ansible/blob/master/playbooks/common/openshift-cluster/config.yml diff --git a/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py b/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py new file mode 100644 index 
class CallbackModule(CallbackBase):
    """Print an INSTALLER STATUS report of installation phases.

    The report is produced from ``v2_playbook_on_stats`` and lists every
    known installer phase with the status recorded for it in the custom
    stats of the run, or 'Not Started' when no status was recorded.
    """

    CALLBACK_VERSION = 2.0
    CALLBACK_TYPE = 'aggregate'
    CALLBACK_NAME = 'installer_checkpoint'
    CALLBACK_NEEDS_WHITELIST = False

    def __init__(self):
        super(CallbackModule, self).__init__()

    def v2_playbook_on_stats(self, stats):
        """Display the status of each installer phase.

        Nothing is printed unless the run recorded custom stats under the
        '_run' key.  A phase whose status is 'In Progress' is followed by a
        hint naming the playbook that restarts it, when it has one.

        :param stats: stats object handed to the callback by Ansible; only
            its ``custom`` mapping is read.
        """
        # Set the order of the installer phases
        installer_phases = [
            'installer_phase_initialize',
            'installer_phase_etcd',
            'installer_phase_nfs',
            'installer_phase_loadbalancer',
            'installer_phase_master',
            'installer_phase_master_additional',
            'installer_phase_node',
            'installer_phase_glusterfs',
            'installer_phase_hosted',
            'installer_phase_metrics',
            'installer_phase_logging',
            'installer_phase_servicecatalog',
        ]

        # Define the attributes of the installer phases
        phase_attributes = {
            'installer_phase_initialize': {
                'title': 'Initialization',
                'playbook': ''
            },
            'installer_phase_etcd': {
                'title': 'etcd Install',
                'playbook': 'playbooks/byo/openshift-etcd/config.yml'
            },
            'installer_phase_nfs': {
                'title': 'NFS Install',
                'playbook': 'playbooks/byo/openshift-nfs/config.yml'
            },
            'installer_phase_loadbalancer': {
                'title': 'Load balancer Install',
                'playbook': 'playbooks/byo/openshift-loadbalancer/config.yml'
            },
            'installer_phase_master': {
                'title': 'Master Install',
                'playbook': 'playbooks/byo/openshift-master/config.yml'
            },
            'installer_phase_master_additional': {
                'title': 'Master Additional Install',
                'playbook': 'playbooks/byo/openshift-master/additional_config.yml'
            },
            'installer_phase_node': {
                'title': 'Node Install',
                'playbook': 'playbooks/byo/openshift-node/config.yml'
            },
            'installer_phase_glusterfs': {
                'title': 'GlusterFS Install',
                'playbook': 'playbooks/byo/openshift-glusterfs/config.yml'
            },
            'installer_phase_hosted': {
                'title': 'Hosted Install',
                'playbook': 'playbooks/byo/openshift-cluster/openshift-hosted.yml'
            },
            'installer_phase_metrics': {
                'title': 'Metrics Install',
                'playbook': 'playbooks/byo/openshift-cluster/openshift-metrics.yml'
            },
            'installer_phase_logging': {
                'title': 'Logging Install',
                'playbook': 'playbooks/byo/openshift-cluster/openshift-logging.yml'
            },
            'installer_phase_servicecatalog': {
                'title': 'Service Catalog Install',
                'playbook': 'playbooks/byo/openshift-cluster/service-catalog.yml'
            },
        }

        # Tolerate a stats object without custom stats instead of raising
        # from the end-of-run callback.
        custom_stats = getattr(stats, 'custom', None) or {}
        if '_run' not in custom_stats:
            return
        run_stats = custom_stats['_run']

        # Width of the longest phase title, used to align the status column
        max_column = max(
            len(attributes['title']) for attributes in phase_attributes.values())

        self._display.banner('INSTALLER STATUS')
        for phase in installer_phases:
            attributes = phase_attributes[phase]
            phase_title = attributes['title']
            label = phase_title + ' ' * (max_column - len(phase_title) + 2)

            if phase not in run_stats:
                # Phase was not found in custom stats
                self._display.display(
                    '{}: {}'.format(label, 'Not Started'),
                    color=C.COLOR_SKIP)
                continue

            phase_status = run_stats[phase]
            self._display.display(
                '{}: {}'.format(label, phase_status),
                color=self.phase_color(phase_status))

            # Only phases that define a playbook can be restarted on their
            # own; the initialization phase has none.
            if phase_status == 'In Progress' and attributes['playbook']:
                self._display.display(
                    '\tThis phase can be restarted by running: {}'.format(
                        attributes['playbook']))

        self._display.display("", screen_only=True)

    def phase_color(self, status):
        """Return the display color for an installer phase status.

        'Complete' maps to the OK color and 'In Progress' to the error
        color.  Any other status triggers a warning and maps to the warning
        color.

        :param status: status value recorded for the phase.
        :returns: one of the Ansible color constants.
        """
        if status == 'Complete':
            return C.COLOR_OK
        if status == 'In Progress':
            return C.COLOR_ERROR

        self._display.warning('Invalid phase status defined: {}'.format(status))
        return C.COLOR_WARN
openshift_metrics_install_metrics | default(false) | bool + - openshift_metrics_install_metrics | bool - include: uninstall_metrics.yaml when: - - openshift_metrics_uninstall_metrics | default(false) | bool + - openshift_metrics_uninstall_metrics | bool - include: uninstall_hosa.yaml when: not openshift_metrics_install_hawkular_agent | bool diff --git a/roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js b/roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js deleted file mode 100644 index d0a9f11dc..000000000 --- a/roles/openshift_service_catalog/files/openshift-ansible-catalog-console.js +++ /dev/null @@ -1,2 +0,0 @@ -// empty file so that the master-config can still point to a file that exists -// this file will be replaced by the template service broker role if enabled diff --git a/roles/openshift_service_catalog/tasks/wire_aggregator.yml b/roles/openshift_service_catalog/tasks/wire_aggregator.yml deleted file mode 100644 index 300a7db62..000000000 --- a/roles/openshift_service_catalog/tasks/wire_aggregator.yml +++ /dev/null @@ -1,215 +0,0 @@ ---- -- name: Make temp cert dir - command: mktemp -d /tmp/openshift-service-catalog-ansible-XXXXXX - register: certtemp - changed_when: False - -- name: Check for First Master Aggregator Signer cert - stat: - path: /etc/origin/master/front-proxy-ca.crt - register: first_proxy_ca_crt - changed_when: false - delegate_to: "{{ first_master }}" - -- name: Check for First Master Aggregator Signer key - stat: - path: /etc/origin/master/front-proxy-ca.crt - register: first_proxy_ca_key - changed_when: false - delegate_to: "{{ first_master }}" - -# TODO: this currently has a bug where hostnames are required -- name: Creating First Master Aggregator signer certs - command: > - {{ hostvars[first_master].openshift.common.client_binary }} adm ca create-signer-cert - --cert=/etc/origin/master/front-proxy-ca.crt - --key=/etc/origin/master/front-proxy-ca.key - 
--serial=/etc/origin/master/ca.serial.txt - delegate_to: "{{ first_master }}" - when: - - not first_proxy_ca_crt.stat.exists - - not first_proxy_ca_key.stat.exists - -- name: Check for Aggregator Signer cert - stat: - path: /etc/origin/master/front-proxy-ca.crt - register: proxy_ca_crt - changed_when: false - -- name: Check for Aggregator Signer key - stat: - path: /etc/origin/master/front-proxy-ca.crt - register: proxy_ca_key - changed_when: false - -- name: Copy Aggregator Signer certs from first master - fetch: - src: "/etc/origin/master/{{ item }}" - dest: "{{ certtemp.stdout }}/{{ item }}" - flat: yes - with_items: - - front-proxy-ca.crt - - front-proxy-ca.key - delegate_to: "{{ first_master }}" - when: - - not proxy_ca_key.stat.exists - - not proxy_ca_crt.stat.exists - -- name: Copy Aggregator Signer certs to host - copy: - src: "{{ certtemp.stdout }}/{{ item }}" - dest: "/etc/origin/master/{{ item }}" - with_items: - - front-proxy-ca.crt - - front-proxy-ca.key - when: - - not proxy_ca_key.stat.exists - - not proxy_ca_crt.stat.exists - -# oc_adm_ca_server_cert: -# cert: /etc/origin/master/front-proxy-ca.crt -# key: /etc/origin/master/front-proxy-ca.key - -- name: Check for first master api-client config - stat: - path: /etc/origin/master/aggregator-front-proxy.kubeconfig - register: first_front_proxy_kubeconfig - delegate_to: "{{ first_master }}" - run_once: true - -# create-api-client-config generates a ca.crt file which will -# overwrite the OpenShift CA certificate. Generate the aggregator -# kubeconfig in a temporary directory and then copy files into the -# master config dir to avoid overwriting ca.crt. 
-- block: - - name: Create first master api-client config for Aggregator - command: > - {{ hostvars[first_master].openshift.common.client_binary }} adm create-api-client-config - --certificate-authority=/etc/origin/master/front-proxy-ca.crt - --signer-cert=/etc/origin/master/front-proxy-ca.crt - --signer-key=/etc/origin/master/front-proxy-ca.key - --user aggregator-front-proxy - --client-dir={{ certtemp.stdout }} - --signer-serial=/etc/origin/master/ca.serial.txt - delegate_to: "{{ first_master }}" - run_once: true - - name: Copy first master api-client config for Aggregator - copy: - src: "{{ certtemp.stdout }}/{{ item }}" - dest: "/etc/origin/master/" - remote_src: true - with_items: - - aggregator-front-proxy.crt - - aggregator-front-proxy.key - - aggregator-front-proxy.kubeconfig - delegate_to: "{{ first_master }}" - run_once: true - when: - - not first_front_proxy_kubeconfig.stat.exists - -- name: Check for api-client config - stat: - path: /etc/origin/master/aggregator-front-proxy.kubeconfig - register: front_proxy_kubeconfig - -- name: Copy api-client config from first master - fetch: - src: "/etc/origin/master/{{ item }}" - dest: "{{ certtemp.stdout }}/{{ item }}" - flat: yes - delegate_to: "{{ first_master }}" - with_items: - - aggregator-front-proxy.crt - - aggregator-front-proxy.key - - aggregator-front-proxy.kubeconfig - when: - - not front_proxy_kubeconfig.stat.exists - -- name: Copy api-client config to host - copy: - src: "{{ certtemp.stdout }}/{{ item }}" - dest: "/etc/origin/master/{{ item }}" - with_items: - - aggregator-front-proxy.crt - - aggregator-front-proxy.key - - aggregator-front-proxy.kubeconfig - when: - - not front_proxy_kubeconfig.stat.exists - -- name: copy tech preview extension file for service console UI - copy: - src: openshift-ansible-catalog-console.js - dest: /etc/origin/master/openshift-ansible-catalog-console.js - -- name: Update master config - yedit: - state: present - src: /etc/origin/master/master-config.yaml - edits: - - 
key: aggregatorConfig.proxyClientInfo.certFile - value: aggregator-front-proxy.crt - - key: aggregatorConfig.proxyClientInfo.keyFile - value: aggregator-front-proxy.key - - key: authConfig.requestHeader.clientCA - value: front-proxy-ca.crt - - key: authConfig.requestHeader.clientCommonNames - value: [aggregator-front-proxy] - - key: authConfig.requestHeader.usernameHeaders - value: [X-Remote-User] - - key: authConfig.requestHeader.groupHeaders - value: [X-Remote-Group] - - key: authConfig.requestHeader.extraHeaderPrefixes - value: [X-Remote-Extra-] - - key: assetConfig.extensionScripts - value: [/etc/origin/master/openshift-ansible-catalog-console.js] - - key: kubernetesMasterConfig.apiServerArguments.runtime-config - value: [apis/settings.k8s.io/v1alpha1=true] - - key: admissionConfig.pluginConfig.PodPreset.configuration.kind - value: DefaultAdmissionConfig - - key: admissionConfig.pluginConfig.PodPreset.configuration.apiVersion - value: v1 - - key: admissionConfig.pluginConfig.PodPreset.configuration.disable - value: false - register: yedit_output - -#restart master serially here -- name: restart master api - systemd: name={{ openshift.common.service_type }}-master-api state=restarted - when: - - yedit_output.changed - - openshift.master.cluster_method == 'native' - -- name: restart master controllers - systemd: name={{ openshift.common.service_type }}-master-controllers state=restarted - when: - - yedit_output.changed - - openshift.master.cluster_method == 'native' - -- name: Verify API Server - # Using curl here since the uri module requires python-httplib2 and - # wait_for port doesn't provide health information. 
- command: > - curl --silent --tlsv1.2 - {% if openshift.common.version_gte_3_2_or_1_2 | bool %} - --cacert {{ openshift.common.config_base }}/master/ca-bundle.crt - {% else %} - --cacert {{ openshift.common.config_base }}/master/ca.crt - {% endif %} - {{ openshift.master.api_url }}/healthz/ready - args: - # Disables the following warning: - # Consider using get_url or uri module rather than running curl - warn: no - register: api_available_output - until: api_available_output.stdout == 'ok' - retries: 120 - delay: 1 - changed_when: false - when: - - yedit_output.changed - -- name: Delete temp directory - file: - name: "{{ certtemp.stdout }}" - state: absent - changed_when: False -- cgit v1.2.3