From 43ad15a2eb651b7e0e8dcd9a57143d4830b213d4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 12 Oct 2024 18:40:03 -0400 Subject: [PATCH 01/14] Increased memory and CPU values. We're hitting a bunch of limits on Oct 12, but I'm not sure whether this is because of high demand, or because of some other issue we're running into. --- helm/name-lookup/values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 4facbc7d..3765bf68 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -99,11 +99,11 @@ app: tolerations: resources: requests: - memory: "300M" - cpu: 250m - limits: memory: "512M" cpu: 500m + limits: + memory: "1G" + cpu: 1000m nameOverride: "" fullnameOverride: "" From 5d31cf96ce8d1f1b617eb22f76c50c6e99cbfeb3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 14 Oct 2024 23:15:29 -0400 Subject: [PATCH 02/14] Added a readiness probe and increased webServer replicas. --- helm/name-lookup/templates/web-deployment.yaml | 6 ++++++ helm/name-lookup/values.yaml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/helm/name-lookup/templates/web-deployment.yaml b/helm/name-lookup/templates/web-deployment.yaml index 094d24cd..9dc2005c 100644 --- a/helm/name-lookup/templates/web-deployment.yaml +++ b/helm/name-lookup/templates/web-deployment.yaml @@ -48,6 +48,12 @@ spec: resources: {{- toYaml . | nindent 12 }} {{- end }} + readinessProbe: + httpGet: + port: {{ .Values.webServer.port }} + path: /status + initialDelaySeconds: 120 + periodSeconds: 30 {{- with .Values.app.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 3765bf68..8598a061 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -5,7 +5,7 @@ webServer: - replicaCount: 1 + replicaCount: 3 service: port: 2433 type: ClusterIP From 82c814cde4ed4c33216fdb88660760b0f7dce2e9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 14 Oct 2024 23:18:41 -0400 Subject: [PATCH 03/14] Increased NameLookup memory to 2G. --- helm/name-lookup/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 8598a061..9305407f 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -102,7 +102,7 @@ app: memory: "512M" cpu: 500m limits: - memory: "1G" + memory: "2G" cpu: 1000m nameOverride: "" From a71e6fc77c86e352ce4687f56f239e05ecb47009 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 14 Oct 2024 23:28:33 -0400 Subject: [PATCH 04/14] Trying a startupProbe and livenessProbe on the Solr pod as well. --- helm/name-lookup/templates/solr-deployment.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/helm/name-lookup/templates/solr-deployment.yaml b/helm/name-lookup/templates/solr-deployment.yaml index e2853e41..f1555b02 100644 --- a/helm/name-lookup/templates/solr-deployment.yaml +++ b/helm/name-lookup/templates/solr-deployment.yaml @@ -69,6 +69,16 @@ spec: volumeMounts: - mountPath: /var/solr/data name: {{ include "name-lookup.fullname" . }}-solr-data-vol + startupProbe: + httpGet: + port: {{ .Values.solr.port }} + path: /solr/admin/cores?action=STATUS + periodSeconds: 30 + livenessProbe: + httpGet: + port: {{ .Values.solr.port }} + path: /solr/admin/cores?action=STATUS + periodSeconds: 30 {{- with .Values.solr.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} @@ -89,4 +99,4 @@ spec: resources: requests: storage: {{ .Values.solr.storage }} - + From f7d406b70a5c672d9eb7fe612ab8e3c99b9386e9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 14 Oct 2024 23:41:51 -0400 Subject: [PATCH 05/14] Rolling back the web deployment self-checks. --- helm/name-lookup/templates/solr-deployment.yaml | 4 ++-- helm/name-lookup/templates/web-deployment.yaml | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/helm/name-lookup/templates/solr-deployment.yaml b/helm/name-lookup/templates/solr-deployment.yaml index f1555b02..ec895312 100644 --- a/helm/name-lookup/templates/solr-deployment.yaml +++ b/helm/name-lookup/templates/solr-deployment.yaml @@ -73,12 +73,12 @@ spec: httpGet: port: {{ .Values.solr.port }} path: /solr/admin/cores?action=STATUS - periodSeconds: 30 + periodSeconds: 60 livenessProbe: httpGet: port: {{ .Values.solr.port }} path: /solr/admin/cores?action=STATUS - periodSeconds: 30 + periodSeconds: 60 {{- with .Values.solr.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/helm/name-lookup/templates/web-deployment.yaml b/helm/name-lookup/templates/web-deployment.yaml index 9dc2005c..094d24cd 100644 --- a/helm/name-lookup/templates/web-deployment.yaml +++ b/helm/name-lookup/templates/web-deployment.yaml @@ -48,12 +48,6 @@ spec: resources: {{- toYaml . | nindent 12 }} {{- end }} - readinessProbe: - httpGet: - port: {{ .Values.webServer.port }} - path: /status - initialDelaySeconds: 120 - periodSeconds: 30 {{- with .Values.app.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} From 05d61c2da21833e2bf39c5f543e2a57afaf314d1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 14 Oct 2024 23:45:03 -0400 Subject: [PATCH 06/14] Reducing replicaCount to 1, since this just seems to confuse things.
--- helm/name-lookup/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 9305407f..77d733bf 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -5,7 +5,7 @@ webServer: - replicaCount: 3 + replicaCount: 1 service: port: 2433 type: ClusterIP From 9cbce8ed72e04cf5cde38d377a819c79aac3e73e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 14 Oct 2024 23:45:42 -0400 Subject: [PATCH 07/14] Restored web deployment pods. --- helm/name-lookup/templates/web-deployment.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/helm/name-lookup/templates/web-deployment.yaml b/helm/name-lookup/templates/web-deployment.yaml index 094d24cd..dc41cdbd 100644 --- a/helm/name-lookup/templates/web-deployment.yaml +++ b/helm/name-lookup/templates/web-deployment.yaml @@ -48,6 +48,16 @@ spec: resources: {{- toYaml . | nindent 12 }} {{- end }} + startupProbe: + httpGet: + port: {{ .Values.webServer.port }} + path: /status + periodSeconds: 120 + readinessProbe: + httpGet: + port: {{ .Values.webServer.port }} + path: /status + periodSeconds: 120 {{- with .Values.app.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} From b4e70b145bc7a353bbe7c7833fdd647491218b2e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 00:01:55 -0400 Subject: [PATCH 08/14] Increased CPU on Solr pod to 6cpu, tweaked probes. 
--- helm/name-lookup/templates/solr-deployment.yaml | 3 ++- helm/name-lookup/templates/web-deployment.yaml | 2 +- helm/name-lookup/values.yaml | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/helm/name-lookup/templates/solr-deployment.yaml b/helm/name-lookup/templates/solr-deployment.yaml index ec895312..ae57fe19 100644 --- a/helm/name-lookup/templates/solr-deployment.yaml +++ b/helm/name-lookup/templates/solr-deployment.yaml @@ -78,7 +78,8 @@ spec: httpGet: port: {{ .Values.solr.port }} path: /solr/admin/cores?action=STATUS - periodSeconds: 60 + periodSeconds: 120 + terminationGracePeriodSeconds: 600 {{- with .Values.solr.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/helm/name-lookup/templates/web-deployment.yaml b/helm/name-lookup/templates/web-deployment.yaml index dc41cdbd..e66c218b 100644 --- a/helm/name-lookup/templates/web-deployment.yaml +++ b/helm/name-lookup/templates/web-deployment.yaml @@ -14,7 +14,7 @@ spec: template: metadata: labels: - {{- include "name-lookup.selectorLabels" . | nindent 8 }} + {{- include "name-lookup.labels" . | nindent 8 }} app-name: web-server spec: containers: diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 77d733bf..9da47367 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -58,7 +58,7 @@ solr: cpu: 1000m limits: memory: "32Gi" - cpu: 4000m + cpu: 6000m # You can control the nodeSelector/affinity/tolerations settings for Solr with the following settings. # Other pods (web, restore, backup) are controlled via app.nodeSelector/affinity/tolerations below. From 43a969310e7ec5fb296d3ba7751090309c9bf0f8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 00:52:57 -0400 Subject: [PATCH 09/14] Increased web replica count back up to 3. 
--- helm/name-lookup/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 9da47367..bc90c718 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -5,7 +5,7 @@ webServer: - replicaCount: 1 + replicaCount: 3 service: port: 2433 type: ClusterIP From 8730a761a94dcc76875049ab8f59f9e0f42b11af Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 09:57:17 -0400 Subject: [PATCH 10/14] Reducing web replicas to 1. --- helm/name-lookup/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index bc90c718..9da47367 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -5,7 +5,7 @@ webServer: - replicaCount: 3 + replicaCount: 1 service: port: 2433 type: ClusterIP From 0ac68cb2241a09dbba974a9d6a68300e3a19cb53 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 10:03:46 -0400 Subject: [PATCH 11/14] Tried to reduce the zkClientTimeout from 15000 to 5000. --- helm/name-lookup/templates/solr-deployment.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/helm/name-lookup/templates/solr-deployment.yaml b/helm/name-lookup/templates/solr-deployment.yaml index ae57fe19..4c5ce23d 100644 --- a/helm/name-lookup/templates/solr-deployment.yaml +++ b/helm/name-lookup/templates/solr-deployment.yaml @@ -46,6 +46,7 @@ spec: image: "{{ .Values.solr.image.repository }}:{{ .Values.solr.image.tag }}" args: - '-DzkRun' + - '-DzkClientTimeout=5000' - '-q' - '-Dlog4j2.disable.jmx=true' - '-Dlog4j2.formatMsgNoLookups=true' From 91c0cfc88e452c8506e5b4945985b2fdefd04b97 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 10:42:36 -0400 Subject: [PATCH 12/14] Increased settings. 
--- helm/name-lookup/templates/solr-deployment.yaml | 2 ++ helm/name-lookup/values.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/helm/name-lookup/templates/solr-deployment.yaml b/helm/name-lookup/templates/solr-deployment.yaml index 4c5ce23d..797840c0 100644 --- a/helm/name-lookup/templates/solr-deployment.yaml +++ b/helm/name-lookup/templates/solr-deployment.yaml @@ -75,11 +75,13 @@ spec: port: {{ .Values.solr.port }} path: /solr/admin/cores?action=STATUS periodSeconds: 60 + timeoutSeconds: 120 livenessProbe: httpGet: port: {{ .Values.solr.port }} path: /solr/admin/cores?action=STATUS periodSeconds: 120 + timeoutSeconds: 120 terminationGracePeriodSeconds: 600 {{- with .Values.solr.nodeSelector }} nodeSelector: diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 9da47367..956c12b7 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -58,7 +58,7 @@ solr: cpu: 1000m limits: memory: "32Gi" - cpu: 6000m + cpu: 8000m # You can control the nodeSelector/affinity/tolerations settings for Solr with the following settings. # Other pods (web, restore, backup) are controlled via app.nodeSelector/affinity/tolerations below. From ce2e80770a882ce5ec0eee0686f92c6103407550 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 12:23:45 -0400 Subject: [PATCH 13/14] Increase request, reduce top level CPU. --- helm/name-lookup/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/name-lookup/values.yaml b/helm/name-lookup/values.yaml index 956c12b7..fc8a968b 100644 --- a/helm/name-lookup/values.yaml +++ b/helm/name-lookup/values.yaml @@ -55,10 +55,10 @@ solr: # requests: memory: "16Gi" - cpu: 1000m + cpu: 2000m limits: memory: "32Gi" - cpu: 8000m + cpu: 6000m # You can control the nodeSelector/affinity/tolerations settings for Solr with the following settings. # Other pods (web, restore, backup) are controlled via app.nodeSelector/affinity/tolerations below. 
From 9a45bcebcd33390abe8b5149ee06f7d9f6afe886 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 15 Oct 2024 14:11:29 -0400 Subject: [PATCH 14/14] Updated NameRes Dev to use the experimental bulk endpoint. --- .../name-lookup/renci-dev-values-populated.yaml | Bin 919 -> 956 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/helm/name-lookup/renci-dev-values-populated.yaml b/helm/name-lookup/renci-dev-values-populated.yaml index fcf01d236fa9ef56cc9b15ac4e0f77ea02cdafa9..a8560b87fa92d1c5df98349487cd452bda70330c 100644 GIT binary patch literal 956 zcmV;t14H}(M@dveQdv+`09{ZZX^_%Nu-cPvhi9I1ewM3x0<(iPbeWp{OAIj$a)K_y zo5V#?;OArDLT@Z;!ii^{y~=oX&+dl2xHH6ORqJ{(je@v_7A1~-$H{-T?=LxjEf5kY zFHXaQTDinnD6UKqKR-vrO+DcQwI`#I+pfr zF|C_u0h@h-Xw)$x&NOd7$kW(0v&1<09;Tu%=Se$>C*AvJd19DCFo%-W${&v1j~@2* z$aK7cF9sN!>YxlXsTQLj@_nCmL(S~ngz0Lt8^TX;Z!kZ!2tE{@V%$MBX-)jKHhAS2 zE6(qmTlP(%xf<&CK_{U@GKm>cLYZOAzgIa|LY~CNoqdRV?#z)nYQyf_AxakPC3S^5 z@$2j)&C3Pd;&HwPG2L(KcOd3_NKB4{ksAh1H~bAYm?mz2VC!&veylSKLYa`aq`ZLB z0XrBVerg|3bJL(JD%B5>1IICasJf(K6DA&7SD6bQ)O`_=_Q{=&?&9T#I(+5w`R)#? 
zuJwDG?D*$Snj0HT?c2xQL#q1Mr}3VnF?li<#x z%Zw}!bX06l#?ZPIa0TJiTDh!*bL0I633jSgpCOEVjCbsI9TeOcX{Z>3`K9;zSFyYKvFB+1E{9V zI5PD;FDRlh{B>H|b7f#gZe{LyV%Ed^*o(46oTX@SZ8SoJMztSu`OX0uqf|{vPqcqU zEhaj^RQZ%=9-^2CJGtv}8a+YJd_jJAlUuMxLsAUDuP-6OG{y9k0`pck55kLl8J|;v zbgUD>Fx%0!_3fU-24=bHiVPym#PGuzx7Z4WRZY@AXG@da!NmczhBaj*ynx5FpIo*B z_4v`xF^?7~eU=TwiyLJU^T1IzEM$sMbgqt~{#fP}h#t-(dh#G`wqVsFD}o;c!!^bC z>_86#O;|A!!*M@ROwcQUy5dW$D%7~J9Od%wV#(s6=>O(#2MY?&G^GWa%9W}; z%-$5t+y?!auHd;1p#>g&D?!lIb~@l3#r{@-xLz*mR$GMZPgVsD2oLy|KGVDa literal 919 zcmV;I18DpJM@dveQdv+`0RLPnVwipbkFZd^f(RE7TMVU8{6Tz)4_I`dc)xtKI(znhNwd;FRgY# z7mplt!6WdE5{1Wqvqq93xK!a#`FoD1kdOeIYvEO4<4XPBKayO$!0?x?DBeI%NOO_B znzai;3=jSlBAQ3-LRfUrrP-f46q=w-E5_S|M0(0l+cOJyXTR1b8(da;GV_Q ze^WRZm(1x~&FkX`a+X@Q!&Mhx-cJ~|%qMku22RFLo>%$z#mQ9U9?ce1xi@hRTRSRw zNrIuy)gmUZ!QRvnI-pN`gkd7|uR)96YSQ(|gu1yO#YYTAX@8m#g}7vE*IPggb708; z$E6U^u?eG&^up3TWpu?mKroq8-a<~V#EH*7s=a4T;&3w5o8f_DAUql%RKaLi&y(t$ z7~!gNg{1$9z*93XFAOFomw6D%Debj0MyxspS-y&IxZdKA)2v_n{jPVM{9$PC(I|5Q z4^2t4thKMe0=e%f^H{_=66P4)o}40*?pq)3{C{O@kUITJ+&|9{;1U@6GBAq4Sfer{ zGsKN7Uq%TpNTj2VP2PENwVy)cDWA~#3J5XQSrhS>Gf77QKyy)Tk zqBu?DK-4$%N-i6;%mqAukuteKxl2dpmu%GDlw%lux}#W$$4n9HX)+zd29R1q91O&Q z8c#1{*pC)m))kJGbBaRi%ucJ$lrK;Pk-dpqT{SKk0(byjrfJ#@pUwAs!W<5qnyJzj zO#6ttP_zKUtf_na(!VD{q4dXFMrN2Y^fl`tgW%G_!YP*^IukA!^EFcgT5LCR#i`e9 zdCd5SFw01(b`;JQW&6V($mte|LoxH_Rc;}cCEsf^ti}iqUQ33Rk~9f{3JeE3IK4q+ z;$*@x#IZ0}3vqAX0>n#m{(GCUaLx+4iZzSqOle)#N?h725GcY8B-M=hvpop;PA*Tt zy2a9WNpwoqUuRGewOB}4H{Uc6KR#-Ms+FqU-w=vWJjo40o%smNJ`0u2pW-Rv|HLvd zTYuEMqQ<{m>W;4YaANw^P@*E?&MOvPm4}_N^WZ7UaQyzjNJ4