From 430b52b32bb44e05516b0178501cf2878909aaec Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan" <csa@suren.me>
Date: Thu, 20 Aug 2020 04:02:45 +0200
Subject: EPICS namespace and documentation update

---
 docs/performance/gluster.txt                       | 45 ++++++++++++++++++++++
 docs/projects/epics.txt                            |  1 +
 docs/samples/access/externalip.yaml                | 28 ++++++++++++++
 docs/samples/mysql/templates/01-sds-secrets.yml.j2 | 26 +++++++++++++
 docs/samples/mysql/vars/sds.yml                    | 24 ++++++++++++
 docs/troubleshooting.txt                           | 11 ++++++
 docs/vision.txt                                    |  4 +-
 docs/webservices.txt                               |  8 +++-
 8 files changed, 145 insertions(+), 2 deletions(-)
 create mode 100644 docs/performance/gluster.txt
 create mode 100644 docs/projects/epics.txt
 create mode 100644 docs/samples/access/externalip.yaml
 create mode 100644 docs/samples/mysql/templates/01-sds-secrets.yml.j2
 create mode 100644 docs/samples/mysql/vars/sds.yml

(limited to 'docs')

diff --git a/docs/performance/gluster.txt b/docs/performance/gluster.txt
new file mode 100644
index 0000000..ada9a9c
--- /dev/null
+++ b/docs/performance/gluster.txt
@@ -0,0 +1,45 @@
+# Changed (for katrin_data)
+gluster volume set katrin_data server.event-threads 8 # 1
+gluster volume set katrin_data client.event-threads 8 # 2
+gluster volume set katrin_data performance.io-thread-count 32 # 16
+gluster volume set katrin_data cluster.lookup-optimize on # off
+gluster volume set katrin_data cluster.readdir-optimize on # off
+
+# Already set
+performance.stat-prefetch: on
+performance.readdir-ahead: on
+performance.io-cache: on
+cluster.choose-local: true
+performance.flush-behind: on
+performance.write-behind-window-size: 1MB
+
+# Shall we ?
+gluster volume set katrin_data server.outstanding-rpc-limit # 64
+
+# Does this cache (per-client size) make sense with 1-10s delay?
+gluster volume set katrin_data performance.cache-refresh-timeout 10 # 1 (seconds)
+gluster volume set katrin_data performance.cache-size 1GB # 32MB 
+gluster volume set katrin_data performance.write-behind-window-size # 1MB
+gluster volume set katrin_data performance.cache-max-file-size 2MB # 0 (unlimited)
+
+# Major, but seems to affect only Samba/NFS in 3.x
+gluster volume set katrin_data features.cache-invalidation on # off
+gluster volume set katrin_data performance.cache-invalidation on # off
+gluster volume set katrin_data features.cache-invalidation-timeout 600 # 60
+gluster volume set katrin_data performance.md-cache-timeout 600 # 1
+
+# Not recommended
+gluster volume set katrin_data performance.client-io-threads on # off
+
+
+Sysctl (not applied as we use rdma anyway)
+=========
+sysctl -w net.ipv4.tcp_congestion_control=htcp # cubic
+sysctl -w net.ipv4.tcp_mtu_probing=1				# recommended for hosts with jumbo frames enabled
+
+# Optimal value unclear (big value may harm small file performance)
+sysctl -w net.ipv4.tcp_rmem="4096 87380 33554432"		# increase Linux autotuning TCP buffer limit to 32MB
+sysctl -w net.ipv4.tcp_wmem="4096 87380 33554432"
+sysctl -w net.core.rmem_max=67108864				# allow testing with buffers up to 64MB
+sysctl -w net.core.wmem_max=67108864
+sysctl -w net.core.netdev_max_backlog=30000			# increase the length of the processor input queue
diff --git a/docs/projects/epics.txt b/docs/projects/epics.txt
new file mode 100644
index 0000000..6190dbd
--- /dev/null
+++ b/docs/projects/epics.txt
@@ -0,0 +1 @@
+EPICS_CA_ADDR_LIST="172.30.14.13" caget -w 3 -t darwin:ist:ts1
diff --git a/docs/samples/access/externalip.yaml b/docs/samples/access/externalip.yaml
new file mode 100644
index 0000000..3827968
--- /dev/null
+++ b/docs/samples/access/externalip.yaml
@@ -0,0 +1,28 @@
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: epics
+  name: epics-softioc-external
+spec:
+  type: ClusterIP
+  externalIPs:  # address on which the service is additionally exposed
+    - 192.168.130.1
+  selector:
+    deploymentconfig: epics-softioc
+  ports:  # ports 5064 and 5065, each published over both TCP and UDP
+    - name: 5064-tcp
+      protocol: TCP
+      port: 5064
+      targetPort: 5064
+    - name: 5064-udp
+      protocol: UDP
+      port: 5064
+      targetPort: 5064
+    - name: 5065-tcp
+      protocol: TCP
+      port: 5065
+      targetPort: 5065
+    - name: 5065-udp
+      protocol: UDP
+      port: 5065
+      targetPort: 5065
diff --git a/docs/samples/mysql/templates/01-sds-secrets.yml.j2 b/docs/samples/mysql/templates/01-sds-secrets.yml.j2
new file mode 100644
index 0000000..2922118
--- /dev/null
+++ b/docs/samples/mysql/templates/01-sds-secrets.yml.j2
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: Template
+metadata:
+  name: sds-secrets
+  labels:
+    app: sds
+  annotations:
+    description: "SymmetricDS Secrets"
+objects:
+- apiVersion: v1
+  kind: Secret
+  metadata:
+    annotations:
+      template.openshift.io/expose-root_password: '{.data[''root-password'']}'
+      template.openshift.io/expose-database_password: '{.data[''database-password'']}'
+    name: sds
+  stringData:  # both entries are filled from the same DATABASE_PASSWORD parameter
+    root-password: "${DATABASE_PASSWORD}"
+    database-password: "${DATABASE_PASSWORD}"
+parameters:
+- description: SymmetricDS Database Password
+  displayName: SymmetricDS Database Password
+  from: '[a-zA-Z0-9]{16}'  # pattern used when the value is generated
+  generate: expression
+  name: DATABASE_PASSWORD
+  required: true
diff --git a/docs/samples/mysql/vars/sds.yml b/docs/samples/mysql/vars/sds.yml
new file mode 100644
index 0000000..abe0f4f
--- /dev/null
+++ b/docs/samples/mysql/vars/sds.yml
@@ -0,0 +1,24 @@
+sds:  # sample variables describing a single-replica MySQL 5.7 pod
+  pods:
+    sds-mysql:
+      service: { ports: [ 3306 ] }
+      sched: { replicas: 1, strategy: "Recreate" }
+      groups: [ "services_sds" ]
+      images:
+        - stream: "openshift/mysql:5.7"
+          env:
+            - { name: "MYSQL_USER", value: "sds" }
+            - { name: "MYSQL_PASSWORD", value: "secret@sds/database-password" }  # NOTE(review): presumably key 'database-password' of secret 'sds' - confirm
+            - { name: "MYSQL_ROOT_PASSWORD", value: "secret@sds/root-password" }  # NOTE(review): presumably key 'root-password' of secret 'sds' - confirm
+            - { name: "MYSQL_DATABASE", value: "sds" }
+            - { name: "MYSQL_MAX_CONNECTIONS", value: "50" }
+          mappings:  # NOTE(review): presumably mounts path 'sds' of volume 'db' at the MySQL data directory - confirm
+            - { name: "db", path: "sds", mount: "/var/lib/mysql/data" }
+          resources: { limit: { cpu: 1000m, mem: 2Gi } }
+#          probes:
+#            - { port: 3306 }
+          probes:
+            - { type: "liveness", port: 3306 }
+            - { type: "readiness", command: [ /bin/sh, -i, -c, MYSQL_PWD="$MYSQL_PASSWORD" mysql -h 127.0.0.1 -u $MYSQL_USER -D $MYSQL_DATABASE -e 'SELECT 1' ], delay: "15", timeout: "5" }  # passes once the application user can run 'SELECT 1'
+          hooks:
+            - { type: "postStart", command: [ /bin/sh, -i, -c, sleep 10; MYSQL_PWD="$MYSQL_ROOT_PASSWORD" mysql -h 127.0.0.1 -u root -D $MYSQL_DATABASE -e "GRANT ALL ON *.* TO 'sds'@'%'; UPDATE mysql.user SET Super_Priv='Y' WHERE user='sds' AND host='%'; FLUSH PRIVILEGES;" ] }  # after a 10s pause, grants user 'sds' all privileges including SUPER
diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt
index 1f52fe9..5eb0cc7 100644
--- a/docs/troubleshooting.txt
+++ b/docs/troubleshooting.txt
@@ -263,6 +263,17 @@ pods: very slow scheduling (normal start time in seconds range), failed pods, ro
  and the pods should be allowed to access files. Possible errors:
     unable to create pods: pods "mongodb-2-" is forbidden: no providers available to validate pod request
 
+Pod Networking
+==============
+- Run commands in pod network, particularly execute packet sniffers (which would not work in the container due to missing capabilities)
+  * Get container cid
+    docker ps -f label=io.kubernetes.pod.name=epics-archappl-46-h6j62 -f label=io.kubernetes.pod.namespace=epics -f label=io.kubernetes.docker.type=podsandbox -q
+  * Run command with container networking, e.g. tcpdump
+    nsenter -n -t $(docker inspect --format "{{ .State.Pid }}" "f5a0ad4f5793") tcpdump -nv -i eth0
+
+- Check if service properly exposed ports
+    * 'nc' will not properly tell if UDP port is open or not due to underlying firewall.
+    iptables -n -L -t nat  | grep 5064 | grep 172
 
 
 Builds
diff --git a/docs/vision.txt b/docs/vision.txt
index bf6de57..fdc921d 100644
--- a/docs/vision.txt
+++ b/docs/vision.txt
@@ -2,12 +2,14 @@ Ands v.2
 ========
  - Try overlay2 storage driver (LVM is used in Ands v.1). Check also further docker configuration options: 'cgroup-driver', ...
     * This actually seems problematic in CentOS-8. Something, like 'rsync portage portage/.tmp' is EXREMELY slow (<1 MB/s). Just check eix-sync.
- - Integrate fast Ethernet and use conteiner native networking. OpenVSwitch is slow and causes problems.
+ - Integrate fast Ethernet and use container native networking. OpenVSwitch is slow and causes problems. Alternatively, can we rely on some hardware
+   features of novel network cards, e.g. Mellanox ASAP2 (Accelerated Switch and Packet Processing)?
  - Do not run pods on Master nodes, but Gluster and a few databases pods (MySQL) are OK (multiple reasons, especially mounting a lot of Gluster Volumes)
     * Restrict all periodic jobs to a specific node: easy to re-install (non-master), fast SSD storage, ...?
  - Object Storage should be integrated, either Gluster Block is ready for production or we have to use Ceph as well
  - Automatic provisioning would be much better then handling volumes trough Ands. Basically, this will render Ands redundant. We can switch to Helm, etc.
    But, we need ability to easily understand which volume belong to which pod/namespace and automatically kill redundant volumes. 
+ - Avoid conflicts with SCC private vlans (KIT WiFi, VPN, ...?)
 
 Questions
 =========
diff --git a/docs/webservices.txt b/docs/webservices.txt
index 2545bd5..0edfdeb 100644
--- a/docs/webservices.txt
+++ b/docs/webservices.txt
@@ -10,12 +10,18 @@ Architecture
     by setting 'haproxy.router.openshift.io/balance' to 'source' in route metadata. Then, the destination
     replica will be determined based on the client IP.
     * HAProxy has configured a default timeout. If replica does not send data within '30s' the connection
-    will be terminated. It can be increased with 'haproxy.router.openshift.io/timeout'
+    will be terminated. It can be increased with 'haproxy.router.openshift.io/timeout' in route metadata.
     * There is a several ways to configure certiciates for HTTPS services defined by type of tls termination
     in the route specification. With 'passthrough' the container is expected to handle certificates itself.
     In the edge termination mode, the certificates are configured in the route and HAProxy manages secure
     communication with clients and provides unencrypted data to the service in the cluster.
 
+ - Sample metadata configuration for route:
+        kind: Route
+        metadata:
+          annotations:
+            haproxy.router.openshift.io/balance: 'source'
+            haproxy.router.openshift.io/timeout: 300s
 
 Updating/Generating certificates for the router
 ===============================================
-- 
cgit v1.2.3