From 1fb49d8220cad74d7221863754c919e3195a3d37 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Mon, 7 Dec 2020 15:17:04 +0100 Subject: A bit of troubleshooting docs and more LSDF samples --- docs/samples/remote_storage/lsdf/lsdf-ips-test.yml | 44 ++++++++++++++++++++++ docs/samples/remote_storage/lsdf/lsdf-ips.yml | 38 +++++++++++++++++++ .../remote_storage/lsdf/lsdf-katrin-test.yml | 44 ++++++++++++++++++++++ docs/samples/remote_storage/lsdf/lsdf-katrin.yml | 38 +++++++++++++++++++ docs/samples/remote_storage/lsdf/lsdf-test.yml | 44 ---------------------- docs/samples/remote_storage/lsdf/lsdf.yml | 38 ------------------- docs/troubleshooting.txt | 33 ++++++++++++++-- 7 files changed, 194 insertions(+), 85 deletions(-) create mode 100644 docs/samples/remote_storage/lsdf/lsdf-ips-test.yml create mode 100644 docs/samples/remote_storage/lsdf/lsdf-ips.yml create mode 100644 docs/samples/remote_storage/lsdf/lsdf-katrin-test.yml create mode 100644 docs/samples/remote_storage/lsdf/lsdf-katrin.yml delete mode 100644 docs/samples/remote_storage/lsdf/lsdf-test.yml delete mode 100644 docs/samples/remote_storage/lsdf/lsdf.yml diff --git a/docs/samples/remote_storage/lsdf/lsdf-ips-test.yml b/docs/samples/remote_storage/lsdf/lsdf-ips-test.yml new file mode 100644 index 0000000..c02a888 --- /dev/null +++ b/docs/samples/remote_storage/lsdf/lsdf-ips-test.yml @@ -0,0 +1,44 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + annotations: + name: lsdf-test +objects: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: lsdf-ips + spec: + volumeName: lsdf-ips + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "10Ti" + - apiVersion: v1 + kind: DeploymentConfig + metadata: + name: lsdf-test + spec: + replicas: 1 + selector: + name: lsdf-test + strategy: + type: Recreate + template: + metadata: + labels: + name: lsdf-test + name: lsdf-test + spec: + volumes: + - name: lsdf-ips + persistentVolumeClaim: + claimName: lsdf-ips + 
containers: + - image: registry.access.redhat.com/rhel7/rhel:7.6 + name: cifs-app + command: ["/bin/bash", "-c", "while true; do sleep 10; done"] + volumeMounts: + - name: lsdf-ips + mountPath: /mnt/lsdf diff --git a/docs/samples/remote_storage/lsdf/lsdf-ips.yml b/docs/samples/remote_storage/lsdf/lsdf-ips.yml new file mode 100644 index 0000000..c091870 --- /dev/null +++ b/docs/samples/remote_storage/lsdf/lsdf-ips.yml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Template +metadata: + name: lsdf + annotations: + descriptions: "LSDF volumes" +objects: + - apiVersion: v1 + kind: Secret + metadata: + name: lsdf-csa + type: openshift.io/cifs + stringData: + username: "gf6501" + password: "" + - apiVersion: v1 + kind: PersistentVolume + metadata: + name: lsdf-ips + spec: + persistentVolumeReclaimPolicy: Retain + accessModes: + - ReadWriteMany + capacity: + storage: 100Ti + claimRef: + name: lsdf-ips + namespace: test + flexVolume: + driver: openshift.io/cifs + fsType: cifs + secretRef: + name: lsdf-csa + options: + networkPath: "//os.lsdf.kit.edu/ips-projects" + mountOptions: "vers=2.0,domain=os.lsdf.kit.edu,file_mode=0664,dir_mode=0775" + + \ No newline at end of file diff --git a/docs/samples/remote_storage/lsdf/lsdf-katrin-test.yml b/docs/samples/remote_storage/lsdf/lsdf-katrin-test.yml new file mode 100644 index 0000000..003da6c --- /dev/null +++ b/docs/samples/remote_storage/lsdf/lsdf-katrin-test.yml @@ -0,0 +1,44 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + annotations: + name: lsdf-katrin-test +objects: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: lsdf-katrin-kdb + spec: + volumeName: lsdf-katrin-kdb + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "100Ti" + - apiVersion: v1 + kind: DeploymentConfig + metadata: + name: lsdf-katrin-test + spec: + replicas: 1 + selector: + name: lsdf-katrin-test + strategy: + type: Recreate + template: + metadata: + labels: + name: lsdf-katrin-test + name: 
lsdf-katrin-test + spec: + volumes: + - name: lsdf-katrin-kdb + persistentVolumeClaim: + claimName: lsdf-katrin-kdb + containers: + - image: registry.access.redhat.com/rhel7/rhel:7.6 + name: cifs-app + command: ["/bin/bash", "-c", "while true; do sleep 10; done"] + volumeMounts: + - name: lsdf-katrin-kdb + mountPath: /mnt/kdb diff --git a/docs/samples/remote_storage/lsdf/lsdf-katrin.yml b/docs/samples/remote_storage/lsdf/lsdf-katrin.yml new file mode 100644 index 0000000..036e9a5 --- /dev/null +++ b/docs/samples/remote_storage/lsdf/lsdf-katrin.yml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Template +metadata: + name: lsdf-katrin + annotations: + descriptions: "LSDF volumes" +objects: + - apiVersion: v1 + kind: Secret + metadata: + name: lsdf-katrin + type: openshift.io/cifs + stringData: + username: "ipe-katrinlsdf-0001" + password: "6?)CW5wE3/" + - apiVersion: v1 + kind: PersistentVolume + metadata: + name: lsdf-katrin-kdb + spec: + persistentVolumeReclaimPolicy: Retain + accessModes: + - ReadWriteMany + capacity: + storage: 100Ti + claimRef: + name: lsdf-katrin-kdb + namespace: katrin + flexVolume: + driver: openshift.io/cifs + fsType: cifs + secretRef: + name: lsdf-katrin + options: + networkPath: "//os.lsdf.kit.edu/ipe-projects/katrin_kdb" + mountOptions: "vers=2.0,domain=os.lsdf.kit.edu,file_mode=0664,dir_mode=0775" + + \ No newline at end of file diff --git a/docs/samples/remote_storage/lsdf/lsdf-test.yml b/docs/samples/remote_storage/lsdf/lsdf-test.yml deleted file mode 100644 index c02a888..0000000 --- a/docs/samples/remote_storage/lsdf/lsdf-test.yml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: template.openshift.io/v1 -kind: Template -metadata: - annotations: - name: lsdf-test -objects: - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: lsdf-ips - spec: - volumeName: lsdf-ips - accessModes: - - ReadWriteOnce - resources: - requests: - storage: "10Ti" - - apiVersion: v1 - kind: DeploymentConfig - metadata: - name: lsdf-test - spec: - replicas: 
1 - selector: - name: lsdf-test - strategy: - type: Recreate - template: - metadata: - labels: - name: lsdf-test - name: lsdf-test - spec: - volumes: - - name: lsdf-ips - persistentVolumeClaim: - claimName: lsdf-ips - containers: - - image: registry.access.redhat.com/rhel7/rhel:7.6 - name: cifs-app - command: ["/bin/bash", "-c", "while true; do sleep 10; done"] - volumeMounts: - - name: lsdf-ips - mountPath: /mnt/lsdf diff --git a/docs/samples/remote_storage/lsdf/lsdf.yml b/docs/samples/remote_storage/lsdf/lsdf.yml deleted file mode 100644 index c091870..0000000 --- a/docs/samples/remote_storage/lsdf/lsdf.yml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: v1 -kind: Template -metadata: - name: lsdf - annotations: - descriptions: "LSDF volumes" -objects: - - apiVersion: v1 - kind: Secret - metadata: - name: lsdf-csa - type: openshift.io/cifs - stringData: - username: "gf6501" - password: "" - - apiVersion: v1 - kind: PersistentVolume - metadata: - name: lsdf-ips - spec: - persistentVolumeReclaimPolicy: Retain - accessModes: - - ReadWriteMany - capacity: - storage: 100Ti - claimRef: - name: lsdf-ips - namespace: test - flexVolume: - driver: openshift.io/cifs - fsType: cifs - secretRef: - name: lsdf-csa - options: - networkPath: "//os.lsdf.kit.edu/ips-projects" - mountOptions: "vers=2.0,domain=os.lsdf.kit.edu,file_mode=0664,dir_mode=0775" - - \ No newline at end of file diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt index 5eb0cc7..459143e 100644 --- a/docs/troubleshooting.txt +++ b/docs/troubleshooting.txt @@ -28,9 +28,9 @@ The services has to be running Pods has to be running ---------------------- - Kubernetes System + Kubernetes System - Integration with public cloud resources as it seems - kube-service-catalog/apiserver - - kube-service-catalog/controller-manager + - kube-service-catalog/controller-manager - this seems optional OpenShift Main Services - default/docker-registry @@ -39,7 +39,7 @@ Pods has to be running - 
openshift-template-service-broker/api-server (daemonset, on all nodes) OpenShift Secondary Services - - openshift-ansible-service-broker/asb + - openshift-ansible-service-broker/asb - this is optional - openshift-ansible-service-broker/asb-etcd GlusterFS @@ -132,6 +132,25 @@ etcd (and general operability) certificate verification code which introduced in etcd 3.2. There are multiple bug repports on the issue. +services +======== + - kube-service-catalog/controller-manager might get stuck in CrashLoopBackOff. This does not seem to matter in the current setup. + * The problem is an expired certificate of kube-service-catalog/apiserver. This can be checked with + curl 'https://172.30.183.21:443/apis/servicecatalog.k8s.io/v1beta1' + * The certificates are located in '/etc/origin/service-catalog' and can be verified. + * There is possibly a way to renew it. However, while prototyping the cluster, it got severely broken each time the + upgrade was executed. The new certificate in 'service-catalog' was one of very few things which actually changed + in the upgrade. Therefore, it might be dangerous to replace it. + * On the other hand, there seem to be no missing services in the current configuration + +nodes: domino failures +===== + - If the OpenShift cluster is overloaded, we might get domino failures if a single node goes off (even temporarily disconnected, e.g. due to restart of origin-node) and all pods + are rescheduled to other nodes of the cluster. + * Increased load, then, may trigger some other nodes offline (for a short while) and cause all pods to be rescheduled from them as well. + * This might continue infinitely as one node gets disconnected after another, pods get rescheduled, and the process never stops + * The only solution is to temporarily remove some pods, e.g. ADEI pods could be easily removed and, then, provisioned back + pods: very slow scheduling (normal start time in seconds range), failed pods, rogue namespaces, etc... 
==== - OpenShift has numerous problems with clean-up resources after the pods. The problems are more likely to happen on the @@ -287,6 +306,14 @@ Storage ======= - The offline bricks can be brough back into the service with the follwoing command gluster volume start openshift force + If this doesn't help, the volume should be stopped and started again + gluster volume stop openshift + gluster volume start openshift + + This might cause problems for the services. Likely pods will continue to run, but will + not be able to access mounted volumes. Particularly, adei-frontends/adei-cachers are affected. + So, these services have to be restarted manually in some cases. + - Running a lot of pods may exhaust available storage. It worth checking if * There is enough Docker storage for containers (lvm) -- cgit v1.2.3