From 26ff3658eb3e73a875ee1fb5df4376aaefd212c8 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:02:45 -0400 Subject: [PATCH 01/10] Add ndt legacy configuration --- .../deployments/etl-gardener-legacy.yml | 99 +++++++++++++++++++ .../persistentvolumes/persistent-volumes.yml | 13 +++ .../services/etl-gardener-legacy-service.yml | 14 +++ 3 files changed, 126 insertions(+) create mode 100644 k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml create mode 100644 k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml diff --git a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml new file mode 100644 index 00000000..a84136dc --- /dev/null +++ b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml @@ -0,0 +1,99 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: etl-gardener-legacy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + # Used to match pre-existing pods that may be affected during updates. + run: etl-gardener-legacy + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + # Pod template. + template: + metadata: + labels: + # Note: run=etl-gardener-server should match a service config with a + # public IP and port so that it is publicly accessible. + run: etl-gardener-legacy + annotations: + # Tell prometheus service discovery to collect metrics from the containers. + prometheus.io/scrape: 'true' + spec: + # When container receives SIGTERM, it begins a new checkpoint. This can + # take longer than the default grace period of 30s. + terminationGracePeriodSeconds: 300 + + # Place the pod into the Guaranteed QoS by setting equal resource + # requests and limits for *all* containers in the pod. + # For more background, see: + # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-qos.md + containers: + - image: gcr.io/{{GCLOUD_PROJECT}}/github-m-lab-etl-gardener:{{GIT_COMMIT}} + name: etl-gardener + env: + - name: GARDENER_SERVICE + value: "true" + - name: GIT_COMMIT + value: "{{GIT_COMMIT}}" + - name: PROJECT + value: "{{GCLOUD_PROJECT}}" + # NOTE: We read archives from the public archive for all projects. + # TODO: Update when we address https://github.com/m-lab/dev-tracker/issues/369 + - name: TASKFILE_BUCKET + value: "pusher-{{GCLOUD_PROJECT}}" # This will work for sandbox/staging, but prod should use archive-measurement-lab. + - name: START_DATE + value: "20190329" + - name: DATE_SKIP # Should be 0 for normal operation + value: 0 + - name: TASK_FILE_SKIP # Should be 0 for normal operation + value: 0 + - name: EXPERIMENT + value: "ndt/legacy" + - name: DATASET + value: "batch" + - name: FINAL_DATASET + value: "base_tables" + - name: QUEUE_BASE + value: "etl-ndt-batch-" + - name: NUM_QUEUES + value: "2" + + ports: + - name: prometheus-port + containerPort: 9090 + - name: service-port + containerPort: 8080 + + livenessProbe: + httpGet: + path: /alive + port: service-port + initialDelaySeconds: 30 + periodSeconds: 60 + + resources: + requests: + memory: "3Gi" + cpu: "1" + limits: + memory: "3Gi" + cpu: "1" + + volumeMounts: + - mountPath: /volume-claim + name: legacy-storage + + nodeSelector: + gardener-node: "true" + + volumes: + - name: legacy-storage + persistentVolumeClaim: + claimName: gardener-legacy-disk0 + diff --git a/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml b/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml index b61409b1..36c52c92 100644 --- a/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml +++ b/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml @@ -49,3 +49,16 @@ spec: resources: requests: storage: 10Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: gardener-legacy-disk0 + annotations: + volume.beta.kubernetes.io/storage-class: "slow" +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml b/k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml new file mode 100644 index 00000000..92b78111 --- /dev/null +++ b/k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: etl-gardener-legacy-service + namespace: default +spec: + ports: + - port: 8080 + protocol: TCP + targetPort: 8080 + selector: + run: etl-gardener-legacy + sessionAffinity: None + type: LoadBalancer From 90fb0e6e560dd0a8cb58c990aa52175e5e5e936a Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:17:18 -0400 Subject: [PATCH 02/10] Values are strings --- .../deployments/etl-gardener-legacy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml index a84136dc..64ac5a9d 100644 --- a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml +++ b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml @@ -50,9 +50,9 @@ spec: - name: START_DATE value: "20190329" - name: DATE_SKIP # Should be 0 for normal operation - value: 0 + value: "0" - name: TASK_FILE_SKIP # Should be 0 for normal operation - value: 0 + value: "0" - name: EXPERIMENT value: "ndt/legacy" - name: DATASET From 9151c56fd3f28b483ba35068d4e9f5835b7ec8ff Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:18:16 -0400 Subject: [PATCH 03/10] Meaningful start date --- k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml index 64ac5a9d..df49c8be 100644 --- a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml +++ b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml @@ -48,7 +48,7 @@ spec: - name: TASKFILE_BUCKET value: "pusher-{{GCLOUD_PROJECT}}" # This will work for sandbox/staging, but prod should use archive-measurement-lab. - name: START_DATE - value: "20190329" + value: "20190513" - name: DATE_SKIP # Should be 0 for normal operation value: "0" - name: TASK_FILE_SKIP # Should be 0 for normal operation From c1d7009a4f3b573893ec4bee8335d1464e324542 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:27:15 -0400 Subject: [PATCH 04/10] Add dedup query for legacy table --- cloud/bq/dedup.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cloud/bq/dedup.go b/cloud/bq/dedup.go index 2476c86a..5c4ee78e 100644 --- a/cloud/bq/dedup.go +++ b/cloud/bq/dedup.go @@ -201,6 +201,22 @@ var dedupTemplateTCPInfo = ` ) WHERE row_number = 1` +var dedupTemplateNDTLegacy = ` + #standardSQL + SELECT + * EXCEPT (row_number) + FROM ( + SELECT + *, ROW_NUMBER() OVER ( + PARTITION BY CONCAT(test_id) + # Use the most recently parsed row + ORDER BY parse_time DESC + ) AS row_number + FROM ` + "`%s`" + ` + ) + WHERE + row_number = 1` + // Dedup executes a query that dedups and writes to destination partition. // This function is alpha status. The interface may change without notice // or major version number change. @@ -234,6 +250,8 @@ func Dedup(ctx context.Context, dsExt *dataset.Dataset, src string, destTable bq queryString = fmt.Sprintf(dedupTemplateTraceroute, src) case strings.HasPrefix(destTable.TableID(), "tcpinfo"): queryString = fmt.Sprintf(dedupTemplateTCPInfo, src) + case strings.HasPrefix(destTable.TableID(), "legacy"): + queryString = fmt.Sprintf(dedupTemplateNDTLegacy, src) default: log.Println("Only handles sidestream, ndt, switch, traceroute, not " + destTable.TableID()) return nil, errors.New("Unknown table type") From 9a5b311cd5651ad327646d921d38c7aef1cb1d27 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:42:56 -0400 Subject: [PATCH 05/10] Update dedup query to use ParseInfo.ParseTime --- cloud/bq/dedup.go | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/cloud/bq/dedup.go b/cloud/bq/dedup.go index 5c4ee78e..65e4b58e 100644 --- a/cloud/bq/dedup.go +++ b/cloud/bq/dedup.go @@ -203,15 +203,9 @@ var dedupTemplateTCPInfo = ` var dedupTemplateNDTLegacy = ` #standardSQL - SELECT - * EXCEPT (row_number) + SELECT * EXCEPT (row_number) FROM ( - SELECT - *, ROW_NUMBER() OVER ( - PARTITION BY CONCAT(test_id) - # Use the most recently parsed row - ORDER BY parse_time DESC - ) AS row_number + SELECT *, ROW_NUMBER() OVER (PARTITION BY CONCAT(test_id) ORDER BY ParseInfo.ParseTime DESC) AS row_number FROM ` + "`%s`" + ` ) WHERE From 31ebdb92dd6b6b9bccfc68cd9ed06b7d1ecfb2a4 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:48:53 -0400 Subject: [PATCH 06/10] Update sanity check for ndtlegacy data --- cloud/bq/sanity.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cloud/bq/sanity.go b/cloud/bq/sanity.go index c96fba16..a50a7f2a 100644 --- a/cloud/bq/sanity.go +++ b/cloud/bq/sanity.go @@ -176,10 +176,19 @@ func GetTableDetail(ctx context.Context, dsExt *dataset.Dataset, table bqiface.T %s -- where clause`, dataset, tableName, where) + legacyNDTQuery := fmt.Sprintf(` + #standardSQL + SELECT COUNT(DISTINCT test_id) AS TestCount, COUNT(DISTINCT ParseInfo.TaskFileName) AS TaskFileCount + FROM `+"`%s.%s`"+` + %s -- where clause`, + dataset, tableName, where) + // TODO - find a better way to do this. query := legacyQuery if parts[0] == "tcpinfo" { query = tcpinfoQuery + } else if parts[0] == "legacy" { + query = legacyNDTQuery } err := dsExt.QueryAndParse(ctx, query, &detail) if err != nil { From 862b4d3faff29899fba362e4f79c5b89ff5e960a Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:53:25 -0400 Subject: [PATCH 07/10] Add link to open issue --- cloud/bq/sanity.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cloud/bq/sanity.go b/cloud/bq/sanity.go index a50a7f2a..d82aefd1 100644 --- a/cloud/bq/sanity.go +++ b/cloud/bq/sanity.go @@ -184,6 +184,7 @@ func GetTableDetail(ctx context.Context, dsExt *dataset.Dataset, table bqiface.T dataset, tableName, where) // TODO - find a better way to do this. + // https://github.com/m-lab/etl-gardener/issues/158 query := legacyQuery if parts[0] == "tcpinfo" { query = tcpinfoQuery From 66e1d95836fc2b1ef4ea53241816bfcb1d90d7ed Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 18:55:28 -0400 Subject: [PATCH 08/10] Remove redundant CONCAT --- cloud/bq/dedup.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/bq/dedup.go b/cloud/bq/dedup.go index 65e4b58e..2b5cf350 100644 --- a/cloud/bq/dedup.go +++ b/cloud/bq/dedup.go @@ -205,7 +205,7 @@ var dedupTemplateNDTLegacy = ` #standardSQL SELECT * EXCEPT (row_number) FROM ( - SELECT *, ROW_NUMBER() OVER (PARTITION BY CONCAT(test_id) ORDER BY ParseInfo.ParseTime DESC) AS row_number + SELECT *, ROW_NUMBER() OVER (PARTITION BY test_id ORDER BY ParseInfo.ParseTime DESC) AS row_number FROM ` + "`%s`" + ` ) WHERE From de9c99b40dc40abcdb0281ca7a04cb0b7254be3c Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 19:17:21 -0400 Subject: [PATCH 09/10] Use legacy specific queues --- k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml index df49c8be..2e912127 100644 --- a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml +++ b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml @@ -60,7 +60,7 @@ spec: - name: FINAL_DATASET value: "base_tables" - name: QUEUE_BASE - value: "etl-ndt-batch-" + value: "etl-legacy-batch-" - name: NUM_QUEUES value: "2" From 42967597c9773e68194755d1eaf1be4abbfafa4b Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 18 Jun 2019 21:03:40 -0400 Subject: [PATCH 10/10] Remove whitespace --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 554e3f02..585005b6 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ [![Waffle.io](https://badge.waffle.io/m-lab/etl-gardener.svg?title=Ready)](http://waffle.io/m-lab/etl-gardener) - ## Gardener provides services for maintaining and reprocessing mlab data. ## Unit Testing