diff options
Diffstat (limited to 'roles')
| -rw-r--r-- | roles/etcd_migrate/README.md | 53 | ||||
| -rw-r--r-- | roles/etcd_migrate/defaults/main.yml | 3 | ||||
| -rw-r--r-- | roles/etcd_migrate/meta/main.yml | 17 | ||||
| -rw-r--r-- | roles/etcd_migrate/tasks/check.yml | 55 | ||||
| -rw-r--r-- | roles/etcd_migrate/tasks/check_cluster_health.yml | 23 | ||||
| -rw-r--r-- | roles/etcd_migrate/tasks/check_cluster_status.yml | 32 | ||||
| -rw-r--r-- | roles/etcd_migrate/tasks/configure.yml | 13 | ||||
| -rw-r--r-- | roles/etcd_migrate/tasks/main.yml | 25 | ||||
| -rw-r--r-- | roles/etcd_migrate/tasks/migrate.yml | 53 | 
9 files changed, 274 insertions, 0 deletions
| diff --git a/roles/etcd_migrate/README.md b/roles/etcd_migrate/README.md new file mode 100644 index 000000000..369e78ff2 --- /dev/null +++ b/roles/etcd_migrate/README.md @@ -0,0 +1,53 @@ +Role Name +========= + +Offline etcd migration of data from v2 to v3 + +Requirements +------------ + +It is expected all consumers of the etcd data are not accessing the data. +Otherwise the migrated data can be out-of-sync with the v2 data and can result in an unhealthy etcd cluster. + +The role itself is responsible for: +- checking etcd cluster health and raft status before the migration +- checking of presence of any v3 data (in that case the migration is stopped) +- migration of v2 data to v3 data (including attaching leases of keys prefixed with "/kubernetes.io/events" and "/kubernetes.io/masterleases" string) +- validation of migrated data (all v2 keys are present in v3 and are set to the identical value) + +The migration itself requires an etcd member to be down in the process. Once the migration is done, the etcd member is started. 
+ +Role Variables +-------------- + +TBD + +Dependencies +------------ + +- etcd_common +- lib_utils + +Example Playbook +---------------- + +```yaml +- name: Migrate etcd data from v2 to v3 +  hosts: oo_etcd_to_config +  gather_facts: no +  tasks: +  - include_role: +      name: openshift_etcd_migrate +    vars: +      etcd_peer: "{{ ansible_default_ipv4.address }}" +``` + +License +------- + +Apache License, Version 2.0 + +Author Information +------------------ + +Jan Chaloupka (jchaloup@redhat.com) diff --git a/roles/etcd_migrate/defaults/main.yml b/roles/etcd_migrate/defaults/main.yml new file mode 100644 index 000000000..05cf41fbb --- /dev/null +++ b/roles/etcd_migrate/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# Default action when calling this role, choices: check, migrate, configure +r_etcd_migrate_action: migrate diff --git a/roles/etcd_migrate/meta/main.yml b/roles/etcd_migrate/meta/main.yml new file mode 100644 index 000000000..f3cabbef6 --- /dev/null +++ b/roles/etcd_migrate/meta/main.yml @@ -0,0 +1,17 @@ +--- +galaxy_info: +  author: Jan Chaloupka +  description: Etcd migration +  company: Red Hat, Inc. 
+  license: Apache License, Version 2.0 +  min_ansible_version: 2.1 +  platforms: +  - name: EL +    versions: +    - 7 +  categories: +  - cloud +  - system +dependencies: +- { role: etcd_common } +- { role: lib_utils } diff --git a/roles/etcd_migrate/tasks/check.yml b/roles/etcd_migrate/tasks/check.yml new file mode 100644 index 000000000..2f07713bc --- /dev/null +++ b/roles/etcd_migrate/tasks/check.yml @@ -0,0 +1,55 @@ +--- +# Check the cluster is healthy +- include: check_cluster_health.yml + +# Check if the member has v3 data already +# Run the migration only if the data are v2 +- name: Check if there are any v3 data +  command: > +    etcdctl --cert {{ etcd_peer_cert_file }} --key {{ etcd_peer_key_file }} --cacert {{ etcd_peer_ca_file }} --endpoints 'https://{{ etcd_peer }}:2379' get "" --from-key --keys-only -w json --limit 1 +  environment: +    ETCDCTL_API: 3 +  register: l_etcdctl_output + +- fail: +    msg: "Unable to get a number of v3 keys" +  when: l_etcdctl_output.rc != 0 + +- fail: +    msg: "The etcd has at least one v3 key" +  when: "'count' in (l_etcdctl_output.stdout | from_json) and (l_etcdctl_output.stdout | from_json).count != 0" + + +# TODO(jchaloup): once the until loop can be used over include/block, +#                 remove the repetitive code +# - until loop not supported over include statement (nor block) +#   https://github.com/ansible/ansible/issues/17098 +# - with_items not supported over block + +# Check the cluster status for the first time +- include: check_cluster_status.yml + +# Check the cluster status for the second time +- block: +  - debug: +      msg: "l_etcd_cluster_status_ok: {{ l_etcd_cluster_status_ok }}" +  - name: Wait a while before another check +    pause: +      seconds: 5 +    when: not l_etcd_cluster_status_ok | bool + +  - include: check_cluster_status.yml +    when: not l_etcd_cluster_status_ok | bool + + +# Check the cluster status for the third time +- block: +  - debug: +      msg: "l_etcd_cluster_status_ok: 
{{ l_etcd_cluster_status_ok }}" +  - name: Wait a while before another check +    pause: +      seconds: 5 +    when: not l_etcd_cluster_status_ok | bool + +  - include: check_cluster_status.yml +    when: not l_etcd_cluster_status_ok | bool diff --git a/roles/etcd_migrate/tasks/check_cluster_health.yml b/roles/etcd_migrate/tasks/check_cluster_health.yml new file mode 100644 index 000000000..1abd6a32f --- /dev/null +++ b/roles/etcd_migrate/tasks/check_cluster_health.yml @@ -0,0 +1,23 @@ +--- +- name: Check cluster health +  command: > +    etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt --endpoint https://{{ etcd_peer }}:2379 cluster-health +  register: etcd_cluster_health +  changed_when: false +  failed_when: false + +- name: Assume a member is not healthy +  set_fact: +    etcd_member_healthy: false + +- name: Get member item health status +  set_fact: +    etcd_member_healthy: true +  with_items: "{{ etcd_cluster_health.stdout_lines }}" +  when: "(etcd_peer in item) and ('is healthy' in item)" + +- name: Check the etcd cluster health +  # TODO(jchaloup): should we fail or ask user if he wants to continue? Or just wait until the cluster is healthy? 
+  fail: +    msg: "Etcd member {{ etcd_peer }} is not healthy" +  when: not etcd_member_healthy diff --git a/roles/etcd_migrate/tasks/check_cluster_status.yml b/roles/etcd_migrate/tasks/check_cluster_status.yml new file mode 100644 index 000000000..90fe385c1 --- /dev/null +++ b/roles/etcd_migrate/tasks/check_cluster_status.yml @@ -0,0 +1,32 @@ +--- +# etcd_ip originates from etcd_common role +- name: Check cluster status +  command: > +    etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints 'https://{{ etcd_peer }}:2379' -w json endpoint status +  environment: +    ETCDCTL_API: 3 +  register: l_etcd_cluster_status + +- name: Retrieve raftIndex +  set_fact: +    etcd_member_raft_index: "{{ (l_etcd_cluster_status.stdout | from_json)[0]['Status']['raftIndex'] }}" + +- block: +  # http://docs.ansible.com/ansible/playbooks_filters.html#extracting-values-from-containers +  - name: Group all raftIndices into a list +    set_fact: +      etcd_members_raft_indices: "{{ groups['oo_etcd_to_config'] | map('extract', hostvars, 'etcd_member_raft_index') | list | unique }}" + +  - name: Check the minimum and the maximum of raftIndices is at most 1 +    set_fact: +      etcd_members_raft_indices_diff: "{{ ((etcd_members_raft_indices | max | int) - (etcd_members_raft_indices | min | int)) | int }}" + +  - debug: +      msg: "Raft indices difference: {{ etcd_members_raft_indices_diff }}" + +  when: inventory_hostname in groups.oo_etcd_to_config[0] + +# The cluster raft status is ok if the difference of the max and min raft index is at most 1 +- name: capture the status +  set_fact: +    l_etcd_cluster_status_ok: "{{ hostvars[groups.oo_etcd_to_config[0]]['etcd_members_raft_indices_diff'] | int < 2 }}" diff --git a/roles/etcd_migrate/tasks/configure.yml b/roles/etcd_migrate/tasks/configure.yml new file mode 100644 index 000000000..a305d5bf3 --- /dev/null +++ b/roles/etcd_migrate/tasks/configure.yml @@ -0,0 +1,13 @@ +--- +- name: Configure 
master to use etcd3 storage backend +  yedit: +    src: /etc/origin/master/master-config.yaml +    key: "{{ item.key }}" +    value: "{{ item.value }}" +  with_items: +    - key: kubernetesMasterConfig.apiServerArguments.storage-backend +      value: +        - etcd3 +    - key: kubernetesMasterConfig.apiServerArguments.storage-media-type +      value: +        - application/vnd.kubernetes.protobuf diff --git a/roles/etcd_migrate/tasks/main.yml b/roles/etcd_migrate/tasks/main.yml new file mode 100644 index 000000000..409b0b613 --- /dev/null +++ b/roles/etcd_migrate/tasks/main.yml @@ -0,0 +1,25 @@ +--- +- name: Fail if invalid r_etcd_migrate_action provided +  fail: +    msg: "etcd_migrate role can only be called with 'check' or 'migrate' or 'configure'" +  when: r_etcd_migrate_action not in ['check', 'migrate', 'configure'] + +- name: Include main action task file +  include: "{{ r_etcd_migrate_action }}.yml" + +# 2. migrate v2 datadir into v3: +#   ETCDCTL_API=3 ./etcdctl migrate  --data-dir=${data_dir} --no-ttl +#   backup the etcd datadir first +#   Provide a way for an operator to specify transformer + +# 3. re-configure OpenShift master at /etc/origin/master/master-config.yml +#   set storage-backend to “etcd3” +# 4. we could leave the master restart to current logic (there is already the code ready (single vs. HA master)) + +# Run +# etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt --endpoint https://172.16.186.45:2379 cluster-health +# to check the cluster health (from the etcdctl.sh aliases file) + +# Another assumption: +# - in order to migrate all etcd v2 data into v3, we need to shut down the cluster (let's verify that on Wednesday meeting) +# - diff --git a/roles/etcd_migrate/tasks/migrate.yml b/roles/etcd_migrate/tasks/migrate.yml new file mode 100644 index 000000000..cb479b0cc --- /dev/null +++ b/roles/etcd_migrate/tasks/migrate.yml @@ -0,0 +1,53 @@ +--- +# Should this be run in a serial manner? 
+- set_fact: +    l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}" + +- name: Disable etcd members +  service: +    name: "{{ l_etcd_service }}" +    state: stopped + +# Should we skip all TTL keys? https://bugzilla.redhat.com/show_bug.cgi?id=1389773 +- name: Migrate etcd data +  command: > +    etcdctl migrate --data-dir={{ etcd_data_dir }} +  environment: +    ETCDCTL_API: 3 +  register: l_etcdctl_migrate + +# TODO(jchaloup): If any of the members fails, we need to restore all members to v2 from the pre-migrate backup +- name: Check the etcd v2 data are correctly migrated +  fail: +    msg: "Failed to migrate a member" +  when: "'finished transforming keys' not in l_etcdctl_migrate.stdout" + +# TODO(jchaloup): start the etcd on a different port so no one can access it +# Once the validation is done +- name: Enable etcd member +  service: +    name: "{{ l_etcd_service }}" +    state: started + +- name: Re-introduce leases (as a replacement for key TTLs) +  command: > +    oadm migrate etcd-ttl \ +    --cert {{ etcd_peer_cert_file }} \ +    --key {{ etcd_peer_key_file }} \ +    --cacert {{ etcd_peer_ca_file }} \ +    --etcd-address 'https://{{ etcd_peer }}:2379' \ +    --ttl-keys-prefix {{ item }} \ +    --lease-duration 1h +  environment: +    ETCDCTL_API: 3 +  with_items: +  - "/kubernetes.io/events" +  - "/kubernetes.io/masterleases" + +- set_fact: +    r_etcd_migrate_success: true + +- name: Enable etcd member +  service: +    name: "{{ l_etcd_service }}" +    state: started | 
