diff --git a/.github/workflows/deploy-config.json b/.github/workflows/deploy-config.json index e5d5c7502..0a7576efd 100644 --- a/.github/workflows/deploy-config.json +++ b/.github/workflows/deploy-config.json @@ -53,10 +53,5 @@ "dir_name": "code_generator_funcadl_xAOD", "image_name": "servicex_code_gen_atlas_xaod", "test_required": true - }, - { - "dir_name": "minio_cleanup", - "image_name": "servicex_minio_cleanup", - "test_required": false } ] diff --git a/docs/deployment/reference.md b/docs/deployment/reference.md index 6b8805463..26dcd1804 100644 --- a/docs/deployment/reference.md +++ b/docs/deployment/reference.md @@ -6,113 +6,109 @@ The following table lists the configurable parameters of the ServiceX chart and their default values. Note that you may also wish to change some of the default parameters for the [rabbitMQ](https://github.com/bitnami/charts/tree/master/bitnami/rabbitmq) or [minio](https://github.com/minio/charts) subcharts. -| Parameter | Description | Default | -|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------| -| `secrets` | Name of a secret deployed into the cluster. Must follow example_secrets.yaml | - | -| `logging.logstash.enabled` | Enable remote logging | true | -| `logging.logstash.host` | Host running logstash listening for log data | `servicex.atlas-ml.org` | -| `logging.logstash.port` | Port to send logging to | 5959 | -| `logging.logstash.protocol` | Protocol to be used (options are TCP and UDP) | TCP | -| `logging.logstash.monitor` | Link to be shown inside Monitor web page iframe | UC Kibana dashboard | -| `logging.logstash.logs` | Link to be shown inside Logs web page iframe | UC Kibana dashboard | -| `app.image` | ServiceX_App image name | `sslhep/servicex_app` | -| `app.tag` | ServiceX image tag | `latest` | -| `app.logLevel` | Logging level for ServiceX web app (uses standard unix levels) | `WARNING` | -| `app.pullPolicy` | ServiceX image pull policy | `Always` | -| `app.checksImage` | ServiceX init container image for checks | `ncsa/checks:latest` | -| `app.rabbitmq.retries` | Number of times to retry connecting to RabbitMQ on startup | 12 | -| `app.rabbitmq.retry_interval` | Number of seconds to wait between RabbitMQ retries on startup | 10 | -| `app.replicas` | Number of App pods to start. Experimental! 
| 1 | -| `app.auth` | Enable authentication or allow unfettered access (Python boolean string) | `false` | -| `app.globusClientID` | Globus application Client ID | - | -| `app.globusClientSecret` | Globus application Client Secret | - | -| `app.adminEmail` | Email address for initial admin user | | -| `app.tokenExpires` | Seconds until the ServiceX API tokens (JWT refresh tokens) expire | False (never) | -| `app.authExpires` | Seconds until the JWT access tokens expire | 21600 (six hours) | -| `app.ingress.enabled` | Enable install of ingress | false | -| `app.ingress.class` | Class to be set in `kubernetes.io/ingress.class` annotation | nginx | -| `app.ingress.host` | Hostname to associate ingress with | servicex.ssl-hep.org | -| `app.ingress.defaultBackend` | Name of a service to send requests to internal endpoints to | default-http-backend | -| `app.ingress.tls.enabled` | Enable TLS for ServiceX API Ingress resource | false | -| `app.ingress.tls.secretName` | Name of TLS Secret used for ServiceX API server | `{{.Release.Name}}-app-tls` | -| `app.ingress.tls.clusterIssuer` | Specify a ClusterIssuer if using cert-manager | - | -| `app.resources` | Pass in Kubernetes pod resource spec to deployment to change CPU and memory | { } | -| `app.slackSigningSecret` | Signing secret for Slack application | - | -| `app.newSignupWebhook` | Slack webhook URL for new signups | - | -| `app.mailgunApiKey` | API key to send Mailgun emails to newly approved users | - | -| `app.mailgunDomain` | Sender domain for emails (should be verified through Mailgun) | - | -| `app.defaultDIDFinderScheme` | DID Finder scheme if none provided in request. If left blank, template will attempt to guess. | - | -| `app.validateTransformerImage` | Should docker image name be validated at DockerHub? | `true` | - | `app.defaultUsers` | Name of secret holding json file with default users to create on deployment | - | -| `app.sqlalchemyEngineOptions` | String that will be interpreted as a Python dict, giving keyword arguments to the `sqlalchemy.create_engine()` function for the internal database. | - | -| `didFinder.rucio.enabled` | Should we deploy the Rucio DID Finder? | `true` | -| `didFinder.rucio.image` | Rucio DID Finder image name | `sslhep/servicex-did-finder` | -| `didFinder.rucio.tag` | Rucio DID Finder image tag | `latest` | -| `didFinder.rucio.pullPolicy` | Rucio DID Finder image pull policy | `Always` | -| `didFinder.rucio.site` | Site name to provide to Rucio to determine input replica locality | - | -| `didFinder.rucio.servicex_latitude` | Latitude of the computing center where ServiceX runs. Will be used by Rucio to return the closest input data replica. | 41.78 | -| `didFinder.rucio.servicex_longitude` | Longitude of the computing center where ServiceX runs. Will be used by Rucio to return the closest input data replica. | -87.7 | -| `didFinder.rucio.reportLogicalFiles` | For CMS xCache sites, we don't want the replicas, only logical names. Set to true to get this behavior | false | -| `didFinder.rucio.rucio_host` | URL for Rucio service to use | `https://voatlasrucio-server-prod.cern.ch:443` | -| `didFinder.rucio.auth_host` | URL to obtain Rucio authentication | `https://atlas-rucio-auth.cern.ch:443` | -| `didFinder.CERNOpenData.enabled` | Should we deploy the CERN OpenData DID Finder? 
|`true` | -| `didFinder.CERNOpenData.image` | CERN OpenData DID Finder image name | `sslhep/servicex-did-finder` | -| `didFinder.CERNOpenData.tag` | CERN OpenData DID Finder image tag | `latest` | -| `didFinder.CERNOpenData.pullPolicy` | CERN OpenData DID Finder image pull policy | `Always` | -| `didFinder.xrootd.enabled` | Should we deploy the XRootD DID Finder? | true | -| `didFinder.xrootd.image` | XRootD DID Finder image name | `sslhep/servicex-did-finder-xrootd` | -| `didFinder.xrootd.tag` | XRootD DID Finder image tag | [chart release] | -| `didFinder.xrootd.pullPolicy` | XRootD DID Finder image pull policy | `Always` | -| `codegen.uproot.enabled` | Deploy the FuncADL Uproot Code generator? | true | -| `codegen.uproot.image` | Code generator image | `sslhep/servicex_code_gen_func_adl_xaod` | -| `codegen.uproot.pullPolicy` | Uproot code generator image pull policy | true | -| `codegen.uproot.tag` | Code generator image tag | develop | -| `codegen.uproot.defaultScienceContainerImage` | The transformer image that should be run against this generated code | `sslhep/servicex_func_adl_xaod_transformer` | -| `codegen.uproot.defaultScienceContainerTag` | Tag for the transformer image that should be run against this generated code | develop | -|`codegen.uproot.enabled` | Deploy the uproot-raw (non-FuncADL) code generator? - also all of the code gen settings above are available | true | -|`codegen.cmssw-5-3-32.enabled` | Deploy the CMS AOD code generator? - also all of the code gen settings above are available | true | -|`codegen.atlasr21.enabled` | Deploy the ATLAS FuncADL Release 21 code generator? - also all of the code gen settings above are available | true | -|`codegen.atlasr22.enabled` | Deploy the ATLAS FuncADL Release 22 code generator? - also all of the code gen settings above are available | true | -|`codegen.python.enabled` | Deploy the python uproot code generator? - also all of the code gen settings, above are available | true | -| `x509Secrets.image` | X509 Secret Service image name | `sslhep/x509-secrets` | -| `x509Secrets.tag` | X509 Secret Service image tag | `latest` | -| `x509Secrets.pullPolicy` | X509 Secret Service image pull policy | `Always` | -| `x509Secrets.vomsOrg` | Which VOMS org to contact for proxy? | `atlas` | -| `x509Secrets.initImage` | X509 Secret Service init container image | `alpine:3.6` | -| `rbacEnabled` | Specify if rbac is enabled in your cluster | `true` | -| `hostMount` | Optional path to mount in transformers as /data | - | -| `gridAccount` | CERN User account name to access Rucio | - | -| `noCerts` | Set to true to disable x509 certs and only use open data | false | -| `rabbitmq.password` | Override the generated RabbitMQ password | leftfoot1 | -| `objectStore.enabled` | Deploy a minio object store with Servicex? | true | -| `objectStore.internal` | Deploy a captive minio instance with this chart? | true | -| `objectStore.publicURL` | What URL should the client use to download files? If set, this is given whether ingress is enabled or not | nil | -| `postgres.enabled` | Deploy a postgres database into cluster? If not, we use a sqllite db | false | -| `minio.auth.rootUser` | Username to log into minio | miniouser | -| `minio.auth.rootPassword` | Password key to log into minio | leftfoot1 | -| `minio.apiIngress.enabled` | Should minio chart deploy an ingress to the service? | false | -| `minio.apiIngress.hostname` | Hostname associate with ingress controller | nil | -| `transformer.cachePrefix` | Prefix string to stick in front of file paths. 
Useful for XCache | | -| `transformer.autoscaler.enabled` | Enable/disable horizontal pod autoscaler for transformers | True | -| `transformer.autoscaler.cpuScaleThreshold` | CPU percentage threshold for pod scaling | 30 | -| `transformer.autoscaler.minReplicas` | Minimum number of transformer pods per request | 1 | -| `transformer.autoscaler.maxReplicas` | Maximum number of transformer pods per request | 20 | -| `transformer.pullPolicy` | Pull policy for transformer pods (Image name specified in REST Request) | Always | -| `transformer.priorityClassName` | priorityClassName for transformer pods (Not setting it means getting global default) | Not Set | -| `transformer.cpuLimit` | Set CPU resource limit for pod in number of cores | 1 | -| `transformer.sidecarImage` | Image name for the transformer sidecar container that hold the serviceX code | 'sslhep/servicex_sidecar_transformer' | -| `transformer.sidecarTag` | Tag for the sidecar container | 'develop' | -| `transformer.sidecarPullPolicy` | Pull Policy for the sidecar container | 'Always' | -| `transformer.persistence.existingClaim` | Existing persistent volume claim | nil | -| `transformer.subdir` | Subdirectory of the mount to write transformer results to (should end with trailing /) | nil | -| `minioCleanup.enabled` | Enable deployment of minio cleanup service | false | -| `minioCleanup.image` | Default image for minioCleanup cronjob | `sslhep/servicex_minio_cleanup` | -| `minioCleanup.tag` | minioCleanup image tag | | -| `minioCleanup.pullPolicy` | minioCleanup image pull policy | `Always` | -| `minioCleanup.threads` | Number of threads to use when processing S3 Storage | 6 | -| `minioCleanup.logLevel` | Log level to use for logging (e.g. DEBUG, INFO, WARN, ERROR, FATAL) | INFO | -| `minioCleanup.schedule` | Schedule for minioCleanup cronjob. See [reference](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax) for details on fields | `* */8 * * *` (every 8 hours) | -| `minioCleanup.maxAge` | Max age in days before removing results | 30 | -| `minioCleanup.maxSize` | Start removing buckets when total space used reaches this number (can use G,M, T suffixes) | '1G' | -| `minioCleanup.normSize` | Size at which to stop removing buckets | '700M' | | +| Parameter | Description | Default | +|-----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------| +| `secrets` | Name of a secret deployed into the cluster. 
Must follow example_secrets.yaml | - | +| `logging.logstash.enabled` | Enable remote logging | true | +| `logging.logstash.host` | Host running logstash listening for log data | `servicex.atlas-ml.org` | +| `logging.logstash.port` | Port to send logging to | 5959 | +| `logging.logstash.protocol` | Protocol to be used (options are TCP and UDP) | TCP | +| `logging.logstash.monitor` | Link to be shown inside Monitor web page iframe | UC Kibana dashboard | +| `logging.logstash.logs` | Link to be shown inside Logs web page iframe | UC Kibana dashboard | +| `app.image` | ServiceX_App image name | `sslhep/servicex_app` | +| `app.tag` | ServiceX image tag | `latest` | +| `app.logLevel` | Logging level for ServiceX web app (uses standard unix levels) | `WARNING` | +| `app.pullPolicy` | ServiceX image pull policy | `Always` | +| `app.checksImage` | ServiceX init container image for checks | `ncsa/checks:latest` | +| `app.rabbitmq.retries` | Number of times to retry connecting to RabbitMQ on startup | 12 | +| `app.rabbitmq.retry_interval` | Number of seconds to wait between RabbitMQ retries on startup | 10 | +| `app.replicas` | Number of App pods to start. Experimental! | 1 | +| `app.auth` | Enable authentication or allow unfettered access (Python boolean string) | `false` | +| `app.globusClientID` | Globus application Client ID | - | +| `app.globusClientSecret` | Globus application Client Secret | - | +| `app.adminEmail` | Email address for initial admin user | | +| `app.tokenExpires` | Seconds until the ServiceX API tokens (JWT refresh tokens) expire | False (never) | +| `app.authExpires` | Seconds until the JWT access tokens expire | 21600 (six hours) | +| `app.ingress.enabled` | Enable install of ingress | false | +| `app.ingress.class` | Class to be set in `kubernetes.io/ingress.class` annotation | nginx | +| `app.ingress.host` | Hostname to associate ingress with | servicex.ssl-hep.org | +| `app.ingress.defaultBackend` | Name of a service to send requests to internal endpoints to | default-http-backend | +| `app.ingress.tls.enabled` | Enable TLS for ServiceX API Ingress resource | false | +| `app.ingress.tls.secretName` | Name of TLS Secret used for ServiceX API server | `{{.Release.Name}}-app-tls` | +| `app.ingress.tls.clusterIssuer` | Specify a ClusterIssuer if using cert-manager | - | +| `app.resources` | Pass in Kubernetes pod resource spec to deployment to change CPU and memory | { } | +| `app.slackSigningSecret` | Signing secret for Slack application | - | +| `app.newSignupWebhook` | Slack webhook URL for new signups | - | +| `app.mailgunApiKey` | API key to send Mailgun emails to newly approved users | - | +| `app.mailgunDomain` | Sender domain for emails (should be verified through Mailgun) | - | +| `app.defaultDIDFinderScheme` | DID Finder scheme if none provided in request. If left blank, template will attempt to guess. | - | +| `app.validateTransformerImage` | Should docker image name be validated at DockerHub? | `true` | + | `app.defaultUsers` | Name of secret holding json file with default users to create on deployment | - | +| `app.sqlalchemyEngineOptions` | String that will be interpreted as a Python dict, giving keyword arguments to the `sqlalchemy.create_engine()` function for the internal database. | - | +| `didFinder.rucio.enabled` | Should we deploy the Rucio DID Finder? 
| `true` | +| `didFinder.rucio.image` | Rucio DID Finder image name | `sslhep/servicex-did-finder` | +| `didFinder.rucio.tag` | Rucio DID Finder image tag | `latest` | +| `didFinder.rucio.pullPolicy` | Rucio DID Finder image pull policy | `Always` | +| `didFinder.rucio.site` | Site name to provide to Rucio to determine input replica locality | - | +| `didFinder.rucio.servicex_latitude` | Latitude of the computing center where ServiceX runs. Will be used by Rucio to return the closest input data replica. | 41.78 | +| `didFinder.rucio.servicex_longitude` | Longitude of the computing center where ServiceX runs. Will be used by Rucio to return the closest input data replica. | -87.7 | +| `didFinder.rucio.reportLogicalFiles` | For CMS xCache sites, we don't want the replicas, only logical names. Set to true to get this behavior | false | +| `didFinder.rucio.rucio_host` | URL for Rucio service to use | `https://voatlasrucio-server-prod.cern.ch:443` | +| `didFinder.rucio.auth_host` | URL to obtain Rucio authentication | `https://atlas-rucio-auth.cern.ch:443` | +| `didFinder.CERNOpenData.enabled` | Should we deploy the CERN OpenData DID Finder? | `true` | +| `didFinder.CERNOpenData.image` | CERN OpenData DID Finder image name | `sslhep/servicex-did-finder` | +| `didFinder.CERNOpenData.tag` | CERN OpenData DID Finder image tag | `latest` | +| `didFinder.CERNOpenData.pullPolicy` | CERN OpenData DID Finder image pull policy | `Always` | +| `didFinder.xrootd.enabled` | Should we deploy the XRootD DID Finder? | true | +| `didFinder.xrootd.image` | XRootD DID Finder image name | `sslhep/servicex-did-finder-xrootd` | +| `didFinder.xrootd.tag` | XRootD DID Finder image tag | [chart release] | +| `didFinder.xrootd.pullPolicy` | XRootD DID Finder image pull policy | `Always` | +| `codegen.uproot.enabled` | Deploy the FuncADL Uproot Code generator? | true | +| `codegen.uproot.image` | Code generator image | `sslhep/servicex_code_gen_func_adl_xaod` | +| `codegen.uproot.pullPolicy` | Uproot code generator image pull policy | true | +| `codegen.uproot.tag` | Code generator image tag | develop | +| `codegen.uproot.defaultScienceContainerImage` | The transformer image that should be run against this generated code | `sslhep/servicex_func_adl_xaod_transformer` | +| `codegen.uproot.defaultScienceContainerTag` | Tag for the transformer image that should be run against this generated code | develop | +| `codegen.uproot.enabled` | Deploy the uproot-raw (non-FuncADL) code generator? - also all of the code gen settings above are available | true | +| `codegen.cmssw-5-3-32.enabled` | Deploy the CMS AOD code generator? - also all of the code gen settings above are available | true | +| `codegen.atlasr21.enabled` | Deploy the ATLAS FuncADL Release 21 code generator? - also all of the code gen settings above are available | true | +| `codegen.atlasr22.enabled` | Deploy the ATLAS FuncADL Release 22 code generator? - also all of the code gen settings above are available | true | +| `codegen.python.enabled` | Deploy the python uproot code generator? - also all of the code gen settings, above are available | true | +| `x509Secrets.image` | X509 Secret Service image name | `sslhep/x509-secrets` | +| `x509Secrets.tag` | X509 Secret Service image tag | `latest` | +| `x509Secrets.pullPolicy` | X509 Secret Service image pull policy | `Always` | +| `x509Secrets.vomsOrg` | Which VOMS org to contact for proxy? 
| `atlas` | +| `x509Secrets.initImage` | X509 Secret Service init container image | `alpine:3.6` | +| `rbacEnabled` | Specify if rbac is enabled in your cluster | `true` | +| `hostMount` | Optional path to mount in transformers as /data | - | +| `gridAccount` | CERN User account name to access Rucio | - | +| `noCerts` | Set to true to disable x509 certs and only use open data | false | +| `rabbitmq.password` | Override the generated RabbitMQ password | leftfoot1 | +| `objectStore.enabled` | Deploy a minio object store with Servicex? | true | +| `objectStore.internal` | Deploy a captive minio instance with this chart? | true | +| `objectStore.publicURL` | What URL should the client use to download files? If set, this is given whether ingress is enabled or not | nil | +| `postgres.enabled` | Deploy a postgres database into cluster? If not, we use a sqllite db | false | +| `minio.auth.rootUser` | Username to log into minio | miniouser | +| `minio.auth.rootPassword` | Password key to log into minio | leftfoot1 | +| `minio.apiIngress.enabled` | Should minio chart deploy an ingress to the service? | false | +| `minio.apiIngress.hostname` | Hostname associate with ingress controller | nil | +| `transformer.cachePrefix` | Prefix string to stick in front of file paths. Useful for XCache | | +| `transformer.autoscaler.enabled` | Enable/disable horizontal pod autoscaler for transformers | True | +| `transformer.autoscaler.cpuScaleThreshold` | CPU percentage threshold for pod scaling | 30 | +| `transformer.autoscaler.minReplicas` | Minimum number of transformer pods per request | 1 | +| `transformer.autoscaler.maxReplicas` | Maximum number of transformer pods per request | 20 | +| `transformer.pullPolicy` | Pull policy for transformer pods (Image name specified in REST Request) | Always | +| `transformer.priorityClassName` | priorityClassName for transformer pods (Not setting it means getting global default) | Not Set | +| `transformer.cpuLimit` | Set CPU resource limit for pod in number of cores | 1 | +| `transformer.sidecarImage` | Image name for the transformer sidecar container that hold the serviceX code | 'sslhep/servicex_sidecar_transformer' | +| `transformer.sidecarTag` | Tag for the sidecar container | 'develop' | +| `transformer.sidecarPullPolicy` | Pull Policy for the sidecar container | 'Always' | +| `transformer.persistence.existingClaim` | Existing persistent volume claim | nil | +| `transformer.subdir` | Subdirectory of the mount to write transformer results to (should end with trailing /) | nil | +| `dataLifecycle.enabled` | Enable deployment of data lifecycle jobs | false | +| `dataLifecycle.image` | Default image for data lifecycle job | `sslhep/servicex_minio_cleanup` | +| `dataLifecycle.tag` | Data lifecycle job image tag | | +| `dataLifecycle.pullPolicy` | Data lifecycle image pull policy | `Always` | +| `dataLifecycle.schedule` | Schedule for minioCleanup cronjob. See [reference](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax) for details on fields | `* */8 * * *` (every 8 hours) | +| `dataLifecycle.retention` | We will archive any transforms older than this. Use the gnu date command --date argument. 
See [date command](https://www.geeksforgeeks.org/date-command-linux-examples/#4-how-to-display-past-dates)for examples | 7 days ago | diff --git a/helm/servicex/templates/data-lifecycle/cronjob.yaml b/helm/servicex/templates/data-lifecycle/cronjob.yaml new file mode 100644 index 000000000..567ff2e75 --- /dev/null +++ b/helm/servicex/templates/data-lifecycle/cronjob.yaml @@ -0,0 +1,32 @@ +{{- if .Values.dataLifecycle.enabled -}} +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ .Release.Name }}-data-lifecycle-job +spec: + schedule: {{ .Values.dataLifecycle.schedule | default "* */8 * * *" | quote }} + concurrencyPolicy: "Forbid" + jobTemplate: + spec: + template: + metadata: + labels: + app: {{ .Release.Name }}-data-lifecycle-job + spec: + containers: + - name: {{ .Release.Name }}-data-lifecycle-job + image: {{ .Values.dataLifecycle.image }}:{{ .Values.dataLifecycle.tag }} + imagePullPolicy: {{ .Values.dataLifecycle.pullPolicy }} + env: + - name: RETENTION_PERIOD + value: {{ .Values.dataLifecycle.retention }} + + command: + - /bin/sh + - -c + - | + CUTOFF_TIMESTAMP=$(date -d "$RETENTION_PERIOD" +%Y-%m-%dT%H:%M:%S) && + echo "Request data lifecycle with cutoff timestamp: $CUTOFF_TIMESTAMP" && + curl --request POST "http://{{ .Release.Name }}-servicex-app:8000/servicex/internal/data-lifecycle?cutoff_timestamp=$CUTOFF_TIMESTAMP" + restartPolicy: OnFailure +{{- end }} diff --git a/helm/servicex/templates/minio-cleanup/cronjob.yaml b/helm/servicex/templates/minio-cleanup/cronjob.yaml deleted file mode 100644 index 115881284..000000000 --- a/helm/servicex/templates/minio-cleanup/cronjob.yaml +++ /dev/null @@ -1,72 +0,0 @@ -{{- if .Values.minioCleanup.enabled -}} -apiVersion: batch/v1 -kind: CronJob -metadata: - name: {{ .Release.Name }}-minio-cleanup -spec: - schedule: {{ .Values.minioCleanup.schedule | default "* */8 * * *" | quote }} - concurrencyPolicy: "Forbid" - jobTemplate: - spec: - template: - metadata: - labels: - app: {{ .Release.Name }}-minio-cleanup - spec: - serviceAccountName: {{ template "servicex.fullname" . 
}} - containers: - - name: {{ .Release.Name }}-minio-cleanup - image: {{ .Values.minioCleanup.image }}:{{ .Values.minioCleanup.tag }} - imagePullPolicy: {{ .Values.minioCleanup.pullPolicy }} - env: - - name: INSTANCE_NAME - value: "{{ .Release.Name }}" - - name: MAX_SIZE - value: "{{ .Values.minioCleanup.maxSize }}" - - name: NORM_SIZE - value: "{{ .Values.minioCleanup.normSize }}" - - name: MAX_AGE - value: "{{ default 30 .Values.minioCleanup.maxAge }}" - - name: MINIO_ENCRYPT - value: {{ .Values.objectStore.useTLS | ternary "true" "false" | quote }} - {{ if not .Values.objectStore.internal }} - # using external minio - - name: MINIO_URL - value: {{ .Values.objectStore.publicURL }} - {{ else }} - - name: MINIO_URL - value: {{ .Release.Name }}-minio:{{ .Values.minio.service.ports.api }} - {{- end }} - {{ if .Values.minioCleanup.threads }} - - name: THREADS - value: "{{ .Values.minioCleanup.threads }}" - {{ end }} - {{ if .Values.minioCleanup.logLevel }} - - name: LOG_LEVEL - value: {{ .Values.minioCleanup.logLevel }} - {{ end }} - {{ if .Values.secrets }} - - name: ACCESS_KEY - valueFrom: - secretKeyRef: - name: {{ .Values.secrets }} - key: accesskey - - name: SECRET_KEY - valueFrom: - secretKeyRef: - name: {{ .Values.secrets }} - key: secretkey - {{ else }} - - name: ACCESS_KEY - value: {{ .Values.minio.auth.rootUser }} - - name: SECRET_KEY - value: {{ .Values.minio.auth.rootPassword }} - {{- end }} - {{- if .Values.logging.logstash.enabled }} - - name: LOGSTASH_HOST - value: "{{ .Values.logging.logstash.host }}" - - name: LOGSTASH_PORT - value: "{{ .Values.logging.logstash.port }}" - {{- end }} - restartPolicy: OnFailure -{{- end }} diff --git a/helm/servicex/values.yaml b/helm/servicex/values.yaml index 691fa23c9..9b96c4890 100644 --- a/helm/servicex/values.yaml +++ b/helm/servicex/values.yaml @@ -188,14 +188,18 @@ x509Secrets: pullPolicy: Always tag: develop vomsOrg: atlas -minioCleanup: + +dataLifecycle: enabled: false - image: sslhep/servicex_minio_cleanup - threads: "6" - logLevel: "INFO" + # This image should be a fully functional linux image with gnu date installed + # ideally it should be an image we have already pulled + image: python + tag: "3.10" pullPolicy: Always - tag: develop schedule: "* */8 * * *" - maxAge: 30 - maxSize: "1G" - normSize: "700M" + + # We will archive any transforms older than this. Use the gnu date command + # --date argument. 
+ # See https://www.geeksforgeeks.org/date-command-linux-examples/#4-how-to-display-past-dates + # for examples + retention: "7 days ago" diff --git a/minio_cleanup/Dockerfile b/minio_cleanup/Dockerfile deleted file mode 100644 index df3f7c4fa..000000000 --- a/minio_cleanup/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -FROM python:3.11 AS builder - -RUN useradd -ms /bin/bash cleanup - -COPY pyproject.toml poetry.lock /home/cleanup/ -WORKDIR /home/cleanup - -FROM builder as poetry -ENV POETRY_HOME=/home/cleanup -ENV POETRY_VIRTUALENVS_IN_PROJECT=true -ENV PATH="$POETRY_HOME/bin:$PATH" -RUN python -c 'from urllib.request import urlopen; print(urlopen("https://install.python-poetry.org").read().decode())' | python - -COPY resources ./ -RUN poetry install --no-interaction --no-ansi -vvv - -FROM builder AS runtime - -COPY --from=poetry /home/cleanup /home/cleanup -WORKDIR /home/cleanup -RUN mkdir ./cleanup -COPY scripts/*.py resources/start.sh ./ -RUN mkdir servicex_storage -COPY servicex_storage/* ./servicex_storage/ - -RUN chmod +x start.sh - -USER cleanup - -ENTRYPOINT ["./start.sh"] diff --git a/minio_cleanup/README.md b/minio_cleanup/README.md deleted file mode 100644 index 3ee91ef65..000000000 --- a/minio_cleanup/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# storage_cleanup -Microservice to cleanup storage used by ServiceX diff --git a/minio_cleanup/poetry.lock b/minio_cleanup/poetry.lock deleted file mode 100644 index e3886304f..000000000 --- a/minio_cleanup/poetry.lock +++ /dev/null @@ -1,246 +0,0 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. - -[[package]] -name = "argon2-cffi" -version = "23.1.0" -description = "Argon2 for Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "argon2_cffi-23.1.0-py3-none-any.whl", hash = "sha256:c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea"}, - {file = "argon2_cffi-23.1.0.tar.gz", hash = "sha256:879c3e79a2729ce768ebb7d36d4609e3a78a4ca2ec3a9f12286ca057e3d0db08"}, -] - -[package.dependencies] -argon2-cffi-bindings = "*" - -[package.extras] -dev = ["argon2-cffi[tests,typing]", "tox (>4)"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-copybutton", "sphinx-notfound-page"] -tests = ["hypothesis", "pytest"] -typing = ["mypy"] - -[[package]] -name = "argon2-cffi-bindings" -version = "21.2.0" -description = "Low-level CFFI bindings for Argon2" -optional = false -python-versions = ">=3.6" -files = [ - {file = "argon2-cffi-bindings-21.2.0.tar.gz", hash = "sha256:bb89ceffa6c791807d1305ceb77dbfacc5aa499891d2c55661c6459651fc39e3"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ccb949252cb2ab3a08c02024acb77cfb179492d5701c7cbdbfd776124d4d2367"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9524464572e12979364b7d600abf96181d3541da11e23ddf565a32e70bd4dc0d"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b746dba803a79238e925d9046a63aa26bf86ab2a2fe74ce6b009a1c3f5c8f2ae"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58ed19212051f49a523abb1dbe954337dc82d947fb6e5a0da60f7c8471a8476c"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:bd46088725ef7f58b5a1ef7ca06647ebaf0eb4baff7d1d0d177c6cc8744abd86"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_i686.whl", 
hash = "sha256:8cd69c07dd875537a824deec19f978e0f2078fdda07fd5c42ac29668dda5f40f"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-win32.whl", hash = "sha256:603ca0aba86b1349b147cab91ae970c63118a0f30444d4bc80355937c950c082"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-win_amd64.whl", hash = "sha256:b2ef1c30440dbbcba7a5dc3e319408b59676e2e039e2ae11a8775ecf482b192f"}, - {file = "argon2_cffi_bindings-21.2.0-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e415e3f62c8d124ee16018e491a009937f8cf7ebf5eb430ffc5de21b900dad93"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3e385d1c39c520c08b53d63300c3ecc28622f076f4c2b0e6d7e796e9f6502194"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3e3cc67fdb7d82c4718f19b4e7a87123caf8a93fde7e23cf66ac0337d3cb3f"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a22ad9800121b71099d0fb0a65323810a15f2e292f2ba450810a7316e128ee5"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:93f9bf70084f97245ba10ee36575f0c3f1e7d7724d67d8e5b08e61787c320ed7"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3b9ef65804859d335dc6b31582cad2c5166f0c3e7975f324d9ffaa34ee7e6583"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4966ef5848d820776f5f562a7d45fdd70c2f330c961d0d745b784034bd9f48d"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ef543a89dee4db46a1a6e206cd015360e5a75822f76df533845c3cbaf72670"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ed2937d286e2ad0cc79a7087d3c272832865f779430e0cc2b4f3718d3159b0cb"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5e00316dabdaea0b2dd82d141cc66889ced0cdcbfa599e8b471cf22c620c329a"}, -] - -[package.dependencies] -cffi = ">=1.0.1" - -[package.extras] -dev = ["cogapp", "pre-commit", "pytest", "wheel"] -tests = ["pytest"] - -[[package]] -name = "certifi" -version = "2024.7.4" -description = "Python package for providing Mozilla's CA Bundle." -optional = false -python-versions = ">=3.6" -files = [ - {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, - {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, -] - -[[package]] -name = "cffi" -version = "1.16.0" -description = "Foreign Function Interface for Python calling C code." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, - {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, - {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, - {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, - {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, - {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, - {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, - {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, - {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, - {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, - {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, - {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, - {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, - {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, - {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, - {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, - {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, - {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, - {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, - {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, - {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, - {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, - {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, - {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, - {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, - {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, - {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, -] - -[package.dependencies] -pycparser = "*" - -[[package]] -name = "minio" -version = "7.2.3" -description = "MinIO Python SDK for Amazon S3 Compatible Cloud Storage" -optional = false -python-versions = "*" -files = [ - {file = "minio-7.2.3-py3-none-any.whl", hash = "sha256:e6b5ce0a9b4368da50118c3f0c4df5dbf33885d44d77fce6c0aa1c485e6af7a1"}, - {file = "minio-7.2.3.tar.gz", hash = "sha256:4971dfb1a71eeefd38e1ce2dc7edc4e6eb0f07f1c1d6d70c15457e3280cfc4b9"}, -] - -[package.dependencies] -argon2-cffi = "*" -certifi = "*" -pycryptodome = "*" -typing-extensions = "*" -urllib3 = "*" - -[[package]] -name = "pycparser" -version = "2.21" -description = "C parser in Python" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, - {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, -] - -[[package]] -name = "pycryptodome" -version = "3.20.0" -description = "Cryptographic library for Python" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -files = [ - {file = "pycryptodome-3.20.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:f0e6d631bae3f231d3634f91ae4da7a960f7ff87f2865b2d2b831af1dfb04e9a"}, - {file = "pycryptodome-3.20.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:baee115a9ba6c5d2709a1e88ffe62b73ecc044852a925dcb67713a288c4ec70f"}, - {file = "pycryptodome-3.20.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:417a276aaa9cb3be91f9014e9d18d10e840a7a9b9a9be64a42f553c5b50b4d1d"}, - {file = "pycryptodome-3.20.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a1250b7ea809f752b68e3e6f3fd946b5939a52eaeea18c73bdab53e9ba3c2dd"}, - {file = "pycryptodome-3.20.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:d5954acfe9e00bc83ed9f5cb082ed22c592fbbef86dc48b907238be64ead5c33"}, - {file = "pycryptodome-3.20.0-cp27-cp27m-win32.whl", hash = "sha256:06d6de87c19f967f03b4cf9b34e538ef46e99a337e9a61a77dbe44b2cbcf0690"}, - {file = "pycryptodome-3.20.0-cp27-cp27m-win_amd64.whl", hash = "sha256:ec0bb1188c1d13426039af8ffcb4dbe3aad1d7680c35a62d8eaf2a529b5d3d4f"}, - {file = "pycryptodome-3.20.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5601c934c498cd267640b57569e73793cb9a83506f7c73a8ec57a516f5b0b091"}, - {file = 
"pycryptodome-3.20.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:d29daa681517f4bc318cd8a23af87e1f2a7bad2fe361e8aa29c77d652a065de4"}, - {file = "pycryptodome-3.20.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3427d9e5310af6680678f4cce149f54e0bb4af60101c7f2c16fdf878b39ccccc"}, - {file = "pycryptodome-3.20.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:3cd3ef3aee1079ae44afaeee13393cf68b1058f70576b11439483e34f93cf818"}, - {file = "pycryptodome-3.20.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac1c7c0624a862f2e53438a15c9259d1655325fc2ec4392e66dc46cdae24d044"}, - {file = "pycryptodome-3.20.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:76658f0d942051d12a9bd08ca1b6b34fd762a8ee4240984f7c06ddfb55eaf15a"}, - {file = "pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f35d6cee81fa145333137009d9c8ba90951d7d77b67c79cbe5f03c7eb74d8fe2"}, - {file = "pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76cb39afede7055127e35a444c1c041d2e8d2f1f9c121ecef573757ba4cd2c3c"}, - {file = "pycryptodome-3.20.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49a4c4dc60b78ec41d2afa392491d788c2e06edf48580fbfb0dd0f828af49d25"}, - {file = "pycryptodome-3.20.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fb3b87461fa35afa19c971b0a2b7456a7b1db7b4eba9a8424666104925b78128"}, - {file = "pycryptodome-3.20.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:acc2614e2e5346a4a4eab6e199203034924313626f9620b7b4b38e9ad74b7e0c"}, - {file = "pycryptodome-3.20.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:210ba1b647837bfc42dd5a813cdecb5b86193ae11a3f5d972b9a0ae2c7e9e4b4"}, - {file = "pycryptodome-3.20.0-cp35-abi3-win32.whl", hash = "sha256:8d6b98d0d83d21fb757a182d52940d028564efe8147baa9ce0f38d057104ae72"}, - {file = "pycryptodome-3.20.0-cp35-abi3-win_amd64.whl", hash = "sha256:9b3ae153c89a480a0ec402e23db8d8d84a3833b65fa4b15b81b83be9d637aab9"}, - {file = "pycryptodome-3.20.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:4401564ebf37dfde45d096974c7a159b52eeabd9969135f0426907db367a652a"}, - {file = "pycryptodome-3.20.0-pp27-pypy_73-win32.whl", hash = "sha256:ec1f93feb3bb93380ab0ebf8b859e8e5678c0f010d2d78367cf6bc30bfeb148e"}, - {file = "pycryptodome-3.20.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:acae12b9ede49f38eb0ef76fdec2df2e94aad85ae46ec85be3648a57f0a7db04"}, - {file = "pycryptodome-3.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f47888542a0633baff535a04726948e876bf1ed880fddb7c10a736fa99146ab3"}, - {file = "pycryptodome-3.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e0e4a987d38cfc2e71b4a1b591bae4891eeabe5fa0f56154f576e26287bfdea"}, - {file = "pycryptodome-3.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c18b381553638414b38705f07d1ef0a7cf301bc78a5f9bc17a957eb19446834b"}, - {file = "pycryptodome-3.20.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a60fedd2b37b4cb11ccb5d0399efe26db9e0dd149016c1cc6c8161974ceac2d6"}, - {file = "pycryptodome-3.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:405002eafad114a2f9a930f5db65feef7b53c4784495dd8758069b89baf68eab"}, - {file = "pycryptodome-3.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:2ab6ab0cb755154ad14e507d1df72de9897e99fd2d4922851a276ccc14f4f1a5"}, - {file = "pycryptodome-3.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:acf6e43fa75aca2d33e93409f2dafe386fe051818ee79ee8a3e21de9caa2ac9e"}, - {file = "pycryptodome-3.20.0.tar.gz", hash = "sha256:09609209ed7de61c2b560cc5c8c4fbf892f8b15b1faf7e4cbffac97db1fffda7"}, -] - -[[package]] -name = "python-logstash" -version = "0.4.8" -description = "Python logging handler for Logstash." -optional = false -python-versions = "*" -files = [ - {file = "python-logstash-0.4.8.tar.gz", hash = "sha256:d04e1ce11ecc107e4a4f3b807fc57d96811e964a554081b3bbb44732f74ef5f9"}, -] - -[[package]] -name = "typing-extensions" -version = "4.9.0" -description = "Backported and Experimental Type Hints for Python 3.8+" -optional = false -python-versions = ">=3.8" -files = [ - {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, - {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, -] - -[[package]] -name = "urllib3" -version = "2.2.2" -description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = false -python-versions = ">=3.8" -files = [ - {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, - {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[metadata] -lock-version = "2.0" -python-versions = "^3.11" -content-hash = "48eb091ca59d36b2fe93b1ba39eaa9204180a74e57993f54490d20c70620e43a" diff --git a/minio_cleanup/pyproject.toml b/minio_cleanup/pyproject.toml deleted file mode 100644 index 6f69ff6a6..000000000 --- a/minio_cleanup/pyproject.toml +++ /dev/null @@ -1,17 +0,0 @@ -[tool.poetry] -name = "minio_cleanup" -version = "0.1.0" -description = "" -authors = ["Suchandra Thapa "] - -[tool.poetry.dependencies] -python = "^3.11" -minio = "^7.1.12" -python-logstash = "^0.4.8" -urllib3 = "^2.2.2" - -[tool.poetry.dev-dependencies] - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" diff --git a/minio_cleanup/requirements.txt b/minio_cleanup/requirements.txt deleted file mode 100644 index 9e347f602..000000000 --- a/minio_cleanup/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -minio -servicex_storage \ No newline at end of file diff --git a/minio_cleanup/resources/cleanup_batch.yaml b/minio_cleanup/resources/cleanup_batch.yaml deleted file mode 100644 index e70a42110..000000000 --- a/minio_cleanup/resources/cleanup_batch.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: batch/v1 -kind: CronJob -metadata: - name: minio-cleanup -spec: - schedule: "* */8 * * *" - jobTemplate: - spec: - template: - spec: - containers: - - name: minio-cleanup - image: cleanup:0.1 - imagePullPolicy: IfNotPresent - env: - - name: MINIO_URL - value: test - - name: ACCESS_KEY - value: access - - name: SECRET_KEY - value: secret - - name: MAX_AGE - value: 30 - - name: MAX_SIZE - value: '1G' - - name: NORM_SIZE - value: '700M' - restartPolicy: OnFailure \ No newline at end of file diff --git a/minio_cleanup/resources/start.sh b/minio_cleanup/resources/start.sh deleted file mode 100644 index e016de373..000000000 --- 
a/minio_cleanup/resources/start.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -PATH=.venv/bin:$PATH -. .venv/bin/activate -env -python3.11 ./minio_cleanup.py --max-size $MAX_SIZE --norm-size $NORM_SIZE --max-age $MAX_AGE \ No newline at end of file diff --git a/minio_cleanup/scripts/minio_cleanup.py b/minio_cleanup/scripts/minio_cleanup.py deleted file mode 100755 index ef096d2f9..000000000 --- a/minio_cleanup/scripts/minio_cleanup.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/python3.11 -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse -import logstash -import logging -import os -import sys - -from servicex_storage import s3_storage_manager - -instance = os.environ.get('INSTANCE_NAME', 'Unknown') - - -class StreamFormatter(logging.Formatter): - """ - A custom formatter that adds extras. 
- Normally log messages are "level instance component msg extra: {}" - """ - def_keys = ['name', 'msg', 'args', 'levelname', 'levelno', - 'pathname', 'filename', 'module', 'exc_info', - 'exc_text', 'stack_info', 'lineno', 'funcName', - 'created', 'msecs', 'relativeCreated', 'thread', - 'threadName', 'processName', 'process', 'message'] - - def format(self, record: logging.LogRecord) -> str: - """ - :param record: LogRecord - :return: formatted log message - """ - - string = super().format(record) - extra = {k: v for k, v in record.__dict__.items() - if k not in self.def_keys} - if len(extra) > 0: - string += " extra: " + str(extra) - return string - - -class LogstashFormatter(logstash.formatter.LogstashFormatterBase): - - def format(self, record): - message = { - '@timestamp': self.format_timestamp(record.created), - '@version': '1', - 'message': record.getMessage(), - 'path': record.pathname, - 'tags': self.tags, - 'type': self.message_type, - 'instance': instance, - 'component': 'minio cleaner', - - # Extra Fields - 'level': record.levelname, - 'logger_name': record.name, - } - - # Add extra fields - message.update(self.get_extra_fields(record)) - - # If exception, add debug info - if record.exc_info: - message.update(self.get_debug_fields(record)) - - return self.serialize(message) - - -# function to initialize logging -def initialize_logging() -> logging.Logger: - """ - Get a logger and initialize it so that it outputs the correct format - - :return: logger with correct formatting that outputs to console - """ - - log = logging.getLogger() - formatter = logging.Formatter('%(levelname)s ' + - f"{instance} minio cleaner " + '%(message)s') - handler = logging.StreamHandler() - handler.setFormatter(formatter) - handler.setLevel(logging.INFO) - log.addHandler(handler) - log.setLevel(logging.INFO) - return log - - -def strtobool(val: str) -> bool: - """Convert a string representation of truth to true (1) or false (0). - - True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values - are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if - 'val' is anything else. 
- """ - val = val.lower() - if val in ('y', 'yes', 't', 'true', 'on', '1'): - return True - elif val in ('n', 'no', 'f', 'false', 'off', '0'): - return False - else: - raise ValueError("invalid truth value %r" % (val,)) - - -def parse_suffix(size: str) -> int: - """ - Take a string like 100M or 20G or 30T and return an integer - number of bytes that string represents - - raises ValueError if - :param size: a string with a M, G, T suffix indicating size - :return: integer number of bytes - """ - if size[-1] in ['M', 'G', 'T']: # process suffix - raw_max = float(size[:-1]) - mult = size[-1] - if mult == 'M': - raw_max *= 2 ** 20 - elif mult == 'G': - raw_max *= 2 ** 30 - elif mult == 'T': - raw_max *= 2 ** 40 - else: - raise ValueError - return int(raw_max) - else: - return int(size) - - -def run_minio_cleaner(): - """ - Run the minio cleaner - """ - - # Parse the command line arguments - parser = argparse.ArgumentParser() - parser.add_argument('--max-size', dest='max_size', action='store', - default='', - help='Max size allowed before pruning storage') - parser.add_argument('--norm-size', dest='norm_size', action='store', - default='', - help='Size to prune storage to') - parser.add_argument('--max-age', dest='max_age', action='store', - default=30, - type=int, - help='Max age of files in days allowed before pruning storage') - - logstash_host = os.environ.get('LOGSTASH_HOST') - logstash_port = os.environ.get('LOGSTASH_PORT') - level = os.environ.get('LOG_LEVEL', 'INFO').upper() - - stream_handler = logging.StreamHandler() - stream_formatter = StreamFormatter('%(levelname)s ' + - f"{instance} minio_cleanup " + - '%(message)s') - stream_handler.setFormatter(stream_formatter) - stream_handler.setLevel(level) - logger.addHandler(stream_handler) - - if logstash_host and logstash_port: - logstash_handler = logstash.TCPLogstashHandler(logstash_host, logstash_port, version=1) - logstash_formatter = LogstashFormatter('logstash', None, None) - logstash_handler.setFormatter(logstash_formatter) - logstash_handler.setLevel(level) - logger.addHandler(logstash_handler) - - args = parser.parse_args() - try: - raw_max = parse_suffix(args.max_size) - except ValueError: - logger.error(f"Can't parse max size, got: {args.max_size}") - sys.exit(1) - try: - raw_norm = parse_suffix(args.norm_size) - except ValueError: - logger.error(f"Can't parse norm size, got: {args.norm_size}") - sys.exit(1) - - logger.info("ServiceX Minio Cleaner starting up.") - - env_vars = ['MINIO_URL', 'ACCESS_KEY', 'SECRET_KEY'] - error = False - for var in env_vars: - if var not in os.environ: - logger.error(f"{var} not found in environment") - error = True - if error: - logger.error("Exiting due to missing environment variables") - sys.exit(1) - - try: - if 'MINIO_ENCRYPT' in os.environ: - if isinstance(os.environ['MINIO_ENCRYPT'], bool): - use_https = os.environ['MINIO_ENCRYPT'] - else: - use_https = strtobool(os.environ['MINIO_ENCRYPT']) - else: - use_https = False - - store = s3_storage_manager.S3Store(s3_endpoint=os.environ['MINIO_URL'], - access_key=os.environ['ACCESS_KEY'], - secret_key=os.environ['SECRET_KEY'], - use_https=use_https) - logger.info("cleanup started") - results = store.cleanup_storage(max_size=raw_max, norm_size=raw_norm, max_age=args.max_age) - logger.info("cleanup stopped finished.", extra={ - "storage used": results[0], - "storage available": raw_max, - "storage HWM": raw_norm}) - for bucket in results[1]: - logger.info(f"Removed folder/bucket: {bucket}") - logger.info("Deleted buckets", extra={'nbuckets': 
len(results[1])}) - finally: - logger.info('Done running minio storage cleanup') - - -if __name__ == "__main__": - logger = initialize_logging() - run_minio_cleaner() diff --git a/minio_cleanup/servicex_storage/__init__.py b/minio_cleanup/servicex_storage/__init__.py deleted file mode 100644 index 794b0f111..000000000 --- a/minio_cleanup/servicex_storage/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/minio_cleanup/servicex_storage/object_storage_manager.py b/minio_cleanup/servicex_storage/object_storage_manager.py deleted file mode 100644 index 900364eee..000000000 --- a/minio_cleanup/servicex_storage/object_storage_manager.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Definition for abstract Object storage manager class -""" - - -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import abc -import pathlib -from typing import List -from typing import Tuple - - -class ObjectStore(abc.ABC): - """ - Abstract class for object storage managers to use - """ - @abc.abstractmethod - def get_storage_used(self) -> int: - """ - Get storage used by object store - :return: total storage used in bytes - """ - - @abc.abstractmethod - def upload_file(self, bucket: str, object_name: str, path: pathlib.Path) -> None: - """ - Save file to object store - :param bucket: name of bucket - :param object_name: name of object - :param path: path to source file - :return: None - """ - - @abc.abstractmethod - def cleanup_storage(self, - max_size: int, norm_size: int, max_age: int) -> Tuple[int, List[str]]: - """ - Reduce storage used until it's less than max_size - :param max_size: Maximum amount of storage to use before trying to clean up - :param norm_size: when this size is achieved, stop removing files - :param max_age: Maximum number of days a bucket can be before it is cleaned up - :return: Tuple with final storage used and list of buckets removed - """ - - @abc.abstractmethod - def delete_object(self, bucket: str, object_name: str) -> None: - """ - Delete object from store - :param bucket: name of bucket - :param object_name: name of object - :return: None - """ - - @abc.abstractmethod - def delete_objects(self, bucket: str, object_names: List[str]) -> List[Tuple[str, str]]: - """ - Delete object from store - :param bucket: name of bucket - :param object_names: name of object - :return: List of booleans indicating whether each corresponding object was deleted - """ - - @abc.abstractmethod - def get_file(self, bucket: str, object_name: str, path: pathlib.Path) -> None: - """ - Get an object from store - :param bucket: name of bucket - :param object_name: name of object - :param path: path to destination file (must not be present) - :return: None - """ - - @abc.abstractmethod - def get_buckets(self) -> List[str]: - """ - Get an list of buckets from store - :return: List of buckets - """ - - @abc.abstractmethod - def create_bucket(self, bucket: str) -> bool: - """ - Create a bucket with given id - :return: True on success, False otherwise - """ - - @abc.abstractmethod - def delete_bucket(self, bucket: str) -> bool: - """ - Get delete bucket from store - :return: True on success, False otherwise - """ diff --git a/minio_cleanup/servicex_storage/s3_storage_manager.py b/minio_cleanup/servicex_storage/s3_storage_manager.py deleted file mode 100644 index 092cbd510..000000000 --- a/minio_cleanup/servicex_storage/s3_storage_manager.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -Implementation of storage manager for minio based storage -""" - -import datetime -import logging -import os -import pathlib -import concurrent.futures -from typing import List -from typing import Tuple -from collections import namedtuple - -import minio -from minio.deleteobjects import DeleteObject - -from servicex_storage import object_storage_manager - -BucketInfo = namedtuple('BucketInfo', ['name', 
'size', 'last_modified']) - - -class S3Store(object_storage_manager.ObjectStore): - """ - Class to handle operations for minio storage - """ - - def __init__(self, - s3_endpoint: str, access_key: str, secret_key: str, use_https: bool = False): - super().__init__() - - self.logger = logging.getLogger(__name__) - self.logger.addHandler(logging.NullHandler()) - - self.s3_endpoint = s3_endpoint - self.access_key = access_key - self.secret_key = secret_key - - # s3 client is thread safe using Threading, not so much with multiprocessing - self.__s3_client = minio.Minio(self.s3_endpoint, - access_key=self.access_key, - secret_key=self.secret_key, - secure=use_https) - - # set up threads to use - if "THREADS" in os.environ: - try: - self.__threads = int(os.environ["THREADS"]) - self.logger.debug("Using %d threads for storage cleanup", self.__threads) - except ValueError: - self.logger.exception("THREADS env variable not a number, using a single thread") - self.__threads = 1 - else: - self.__threads = 1 - - def get_bucket_info(self, bucket: str) -> BucketInfo: - """ - Given a bucket, get the size and last modified date - :param bucket: bucket name - :return: None - """ - - objects = self.__s3_client.list_objects(bucket) - size = 0 - last_modified = datetime.datetime.now(datetime.timezone.utc) - for obj in objects: - if obj.object_name[-1] == '/': - # this is a bucket within a bucket, skip because this isn't generated - # by ServiceX - continue - result = self.__s3_client.stat_object(obj.bucket_name, obj.object_name) - size += result.size - if result.last_modified < last_modified: - last_modified = result.last_modified - return BucketInfo(name=bucket, size=size, last_modified=last_modified) - - def delete_bucket(self, bucket: str) -> bool: - """ - Delete a given bucket and contents from minio - :param bucket: bucket name - :return: None - """ - if not self.__s3_client.bucket_exists(bucket): - return True - delete_objects = map(lambda x: DeleteObject(x.object_name), - self.__s3_client.list_objects(bucket)) - errors = 0 - for error in self.__s3_client.remove_objects(bucket, delete_objects): - errors += 1 - if errors != 0: - return False - self.__s3_client.remove_bucket(bucket) - return True - - def get_storage_used(self) -> int: - """ - Get the number of bytes used - - :return: integer with number of bytes used - """ - buckets = self.__s3_client.list_buckets() - if len(buckets) == 0: - return 0 - - # must use ThreadPool since minio client is thread safe with threading only - with concurrent.futures.ThreadPoolExecutor(max_workers=self.__threads) as executor: - sizes = executor.map(lambda x: self.get_bucket_info(x).size, buckets) - total_size = sum(sizes) - return total_size - - def delete_object(self, bucket: str, object_name: str) -> None: - """ - Remove object from minio storage - :param bucket: name of bucket - :param object_name: name of object - :return: None - """ - self.__s3_client.remove_object(bucket, object_name) - - def delete_objects(self, bucket: str, object_names: List[str]) -> List[Tuple[str, str]]: - """ - Delete object from store - :param bucket: name of bucket - :param object_names: name of object - :return: List of tuples (objectName, error_message) - """ - delete_objects = [DeleteObject(x) for x in object_names] - delete_results = self.__s3_client.remove_objects(bucket, delete_objects) - return [(x.name, x.message) for x in delete_results] - - def get_file(self, bucket: str, object_name: str, path: pathlib.Path) -> None: - """ - Get object from minio and save to given path - :param 
bucket: bucket name - :param object_name: object name - :param path: path to save - :return: None - """ - try: - resp = self.__s3_client.fget_object(bucket, object_name, path) - except Exception: # pylint: disable=broad-except - self.logger.exception("Got an exception while getting object") - finally: - resp.close() # pylint: disable=no-member - resp.release_conn() # pylint: disable=no-member - - def upload_file(self, bucket: str, object_name: str, path: pathlib.Path) -> None: - """ - Upload file to minio storage - :param bucket: bucket name - :param object_name: destination object name - :param path: path of file source - :return: None - """ - if not os.path.isfile(path): - mesg = f"Can't upload {path}: not present or not a file" - self.logger.error(mesg) - raise IOError(mesg) - self.__s3_client.fput_object(bucket, object_name, path) - - def cleanup_storage(self, - max_size: int, norm_size: int, max_age: int) -> Tuple[int, List[str]]: - """ - Clean up storage by removing old files until below max_size - :param max_size: max amount of storage that can be used before trying to clean up - :param norm_size: when this size is achieved, stop removing files - :param max_age: max number of days a bucket can be before it is deleted - :return: Tuple with final size of storage used and list of buckets removed - """ - buckets = map(lambda x: x.name, self.__s3_client.list_buckets()) - cleaned_buckets = [] - # must use ThreadPool since minio client is thread safe with threading only - with concurrent.futures.ThreadPoolExecutor(max_workers=self.__threads) as executor: - bucket_list = executor.map(self.get_bucket_info, buckets) - - # concurrently delete any old buckets - with concurrent.futures.ThreadPoolExecutor(max_workers=self.__threads) as executor: - kept_buckets = [] - old_buckets = [] - for bucket in bucket_list: - bucket_age = (datetime.datetime.now( - datetime.timezone.utc) - bucket.last_modified).days - if bucket_age > max_age: - old_buckets.append((bucket.name, bucket_age)) - else: - kept_buckets.append(bucket) - - futures = { - executor.submit(lambda x: self.delete_bucket(x), bucket[0]): (bucket[0], bucket[1]) - for bucket in old_buckets - } - for future in concurrent.futures.as_completed(futures): - bucket_info = futures[future] - try: - deleted = future.result() - # use mesg in both log outputs with different capitalizations of D - mesg = f"eleting {bucket_info[0]} due to age: {bucket_info[1]} days" - if deleted: - self.logger.info("D%s", mesg) - cleaned_buckets.append(bucket_info[0]) - else: - self.logger.error("Error d%s", mesg) - except Exception: # pylint: disable=broad-except - self.logger.exception( - "Received exception while deleting %s due to age", bucket_info[0]) - - kept_buckets.sort(key=lambda x: x.last_modified) - idx = 0 - current_size = sum(map(lambda x: x.size, kept_buckets)) - if current_size > max_size: - while current_size > norm_size and idx < len(kept_buckets): - bucket = kept_buckets[idx] - self.logger.info("Deleting %s due to storage limits", bucket.name) - self.delete_bucket(bucket.name) - cleaned_buckets.append(bucket.name) - current_size -= bucket.size - idx += 1 - return current_size, cleaned_buckets - - def get_buckets(self) -> List[str]: - """ - Get list of buckets in s3 - :return: list of bucket names - """ - return [x.name for x in self.__s3_client.list_buckets()] - - def create_bucket(self, bucket: str) -> bool: - """ - Create a bucket with given id - :return: None - """ - try: - self.__s3_client.make_bucket(bucket) - return True - except: - return False diff 
--git a/minio_cleanup/tests/test_minio_storage_manager.py b/minio_cleanup/tests/test_minio_storage_manager.py deleted file mode 100644 index 083075e4b..000000000 --- a/minio_cleanup/tests/test_minio_storage_manager.py +++ /dev/null @@ -1,117 +0,0 @@ -import datetime -import unittest -from collections import namedtuple - -from unittest.mock import patch - -import servicex_storage.s3_storage_manager - -ObjectInfo = namedtuple('ObjectInfo', ['size', 'last_modified']) -s3_fake_objects = { - "bucket1": { - "object1": ObjectInfo(size=10, - last_modified=datetime.datetime(year=2021, month=10, day=1, hour=10, minute=10, second=10)), - "object2": ObjectInfo(size=20, - last_modified=datetime.datetime(year=2021, month=10, day=1, hour=10, minute=11, second=10)), - "object3": ObjectInfo(size=30, - last_modified=datetime.datetime(year=2021, month=10, day=1, hour=10, minute=12, second=10)), - }, - "bucket2": { - "object4": ObjectInfo(size=100, - last_modified=datetime.datetime(year=2020, month=10, day=1, hour=10, minute=10, second=10)), - "object5": ObjectInfo(size=200, - last_modified=datetime.datetime(year=2020, month=10, day=1, hour=10, minute=11, second=10)), - "object6": ObjectInfo(size=300, - last_modified=datetime.datetime(year=2020, month=10, day=1, hour=10, minute=12, second=10)), - } -} - - -class MyTestCase(unittest.TestCase): - @patch('minio.Minio') - def test_s3_get_bucket_info(self, mock_class): - """ - Test s3's get bucket info - :return: None - """ - - mock_class().list_objects.return_value = list(s3_fake_objects["bucket1"].keys()) - mock_class().stat_object.side_effect = list(s3_fake_objects["bucket1"].values()) - return_value = servicex_storage.s3_storage_manager.BucketInfo(name="bucket1", - size=60, - last_modified=datetime.datetime( - year=2021, month=10, - day=1, hour=10, - minute=10, second=10)) - test_obj = servicex_storage.s3_storage_manager.S3Store(s3_endpoint="abc", - access_key="abc", - secret_key="abc") - bucket_info = test_obj.get_bucket_info("bucket1") - self.assertEqual(bucket_info, return_value) - - @patch('minio.Minio') - def test_minio_get_storage_used(self, mock_class): - """ - Test getting storage used by a s3 bucket - :return: None - """ - mock_class().list_buckets.return_value = list(s3_fake_objects.keys()) - mock_class().list_objects.side_effect = [list(s3_fake_objects["bucket1"].keys()), - list(s3_fake_objects["bucket2"].keys())] - mock_class().stat_object.side_effect = list(s3_fake_objects["bucket1"].values()) + \ - list(s3_fake_objects["bucket2"].values()) - - test_obj = servicex_storage.s3_storage_manager.S3Store(s3_endpoint="abc", - access_key="abc", - secret_key="abc") - - bucket_size = test_obj.get_storage_used() - self.assertEqual(bucket_size, 660) - - @patch('minio.Minio') - def test_s3_cleanup_storage(self, mock_class): - """ - Test minio's get bucket info - :return: None - """ - current_s3_fake_objects = { - "bucket1": { - "object1": ObjectInfo(size=10, - last_modified=datetime.datetime.utcnow()), - "object2": ObjectInfo(size=20, - last_modified=datetime.datetime.utcnow()), - "object3": ObjectInfo(size=30, - last_modified=datetime.datetime.utcnow()), - }, - "bucket2": { - "object4": ObjectInfo(size=100, - last_modified=datetime.datetime(year=2020, month=10, day=1, hour=10, minute=10, - second=10)), - "object5": ObjectInfo(size=200, - last_modified=datetime.datetime(year=2020, month=10, day=1, hour=10, minute=11, - second=10)), - "object6": ObjectInfo(size=300, - last_modified=datetime.datetime(year=2020, month=10, day=1, hour=10, minute=12, - second=10)), 
- } - } - - mock_class().list_buckets.return_value = list(current_s3_fake_objects.keys()) - mock_class().list_objects.side_effect = [list(current_s3_fake_objects["bucket1"].keys()), - list(current_s3_fake_objects["bucket2"].keys()), - list(current_s3_fake_objects["bucket2"].keys())] - mock_class().stat_object.side_effect = list(current_s3_fake_objects["bucket1"].values()) + \ - list(current_s3_fake_objects["bucket2"].values()) - - test_obj = servicex_storage.s3_storage_manager.S3Store(s3_endpoint="abc", - access_key="abc", - secret_key="abc") - - final_size = test_obj.cleanup_storage(70, 60, 365)[0] - self.assertEqual(final_size, 60) - mock_class().remove_objects.assert_called_with( - "bucket2", ["object4", "object5", "object6"]) - - -if __name__ == '__main__': - unittest.main() diff --git a/servicex_app/servicex_app/models.py b/servicex_app/servicex_app/models.py index cd8f81b37..7b5e1cb6a 100644 --- a/servicex_app/servicex_app/models.py +++ b/servicex_app/servicex_app/models.py @@ -33,7 +33,8 @@ from typing import Iterable, List, Optional, Union from flask_sqlalchemy import SQLAlchemy -from sqlalchemy import DateTime, ForeignKey, func +from sqlalchemy import DateTime, func +from sqlalchemy import ForeignKey from sqlalchemy.orm import relationship from sqlalchemy.orm.exc import NoResultFound @@ -168,10 +169,9 @@ class TransformRequest(db.Model): request_id = db.Column(db.String(48), unique=True, nullable=False, index=True) title = db.Column(db.String(128), nullable=True) submit_time = db.Column(db.DateTime, nullable=False) - archived = db.Column(db.Boolean, nullable=False, default=False) finish_time = db.Column(db.DateTime, nullable=True) did = db.Column(db.String(512), unique=False, nullable=False) - did_id = db.Column(db.Integer, unique=False, nullable=False) + did_id = db.Column(db.Integer, ForeignKey('datasets.id'), unique=False, nullable=False) selection = db.Column(db.String(max_string_size), unique=False, nullable=True) tree_name = db.Column(db.String(512), unique=False, nullable=True) image = db.Column(db.String(128), nullable=True) @@ -195,6 +195,8 @@ class TransformRequest(db.Model): transformer_language = db.Column(db.String(256), nullable=True) transformer_command = db.Column(db.String(256), nullable=True) + dataset = relationship("Dataset", back_populates="transform_requests") + def save_to_db(self): db.session.add(self) db.session.commit() @@ -415,7 +417,9 @@ class Dataset(db.Model): events = db.Column(db.BigInteger, default=0, nullable=True) lookup_status = db.Column(db.Enum(DatasetStatus), nullable=False) stale = db.Column(db.Boolean, default=False, nullable=False) + files = relationship("DatasetFile", back_populates="dataset") + transform_requests = relationship("TransformRequest", back_populates="dataset") def save_to_db(self): db.session.add(self) diff --git a/servicex_app/servicex_app/resources/internal/data_lifecycle_ops.py b/servicex_app/servicex_app/resources/internal/data_lifecycle_ops.py new file mode 100644 index 000000000..91f45d6ca --- /dev/null +++ b/servicex_app/servicex_app/resources/internal/data_lifecycle_ops.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024, IRIS-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from datetime import datetime +from typing import List + +from flask import request, current_app +from sqlalchemy import select, exists +from sqlalchemy.orm import Session + +from servicex_app import ObjectStoreManager +from servicex_app.models import TransformRequest, Dataset, db, TransformationResult, \ + DatasetFile +from servicex_app.resources.servicex_resource import ServiceXResource + + +class DataLifecycleOps(ServiceXResource): + @classmethod + def make_api(cls, object_store_manager): + cls.object_store = object_store_manager + + @staticmethod + def delete_expired_transforms(session: Session, + object_store: ObjectStoreManager, + cutoff_timestamp: datetime) -> List[str]: + deleted_log = [] + + with session.begin(): + expired_transforms = session.query(TransformRequest).filter( + TransformRequest.submit_time <= cutoff_timestamp + ).all() + + for transform in expired_transforms: + with session.begin(): + # Delete all the results for this transform + session.query(TransformationResult).filter_by( + request_id=transform.request_id).delete() + + # Delete the transformed files out of object store along with the bucket + if object_store: + object_store.delete_bucket_and_contents(transform.request_id) + + # Delete the transform request + session.delete(transform) + + deleted_log.append(f"{transform.submitter_name} - {transform.request_id}: {transform.title}") + return deleted_log + + def delete_orphaned_datasets(self, session: Session): + deleted_datasets = [] + + with session.begin(): + # Use a subquery to find the transforms that are using this dataset + subquery = ( + select(1) + .where(TransformRequest.did_id == Dataset.id) + ) + + # And then use ~exist to narrow down to the datasets that are not being used + query = ( + select(Dataset) + .where(~exists(subquery)) + ) + orphaned_datasets = session.execute(query).scalars().all() + + for dataset in orphaned_datasets: + with session.begin(): + session.query(DatasetFile).filter_by( + dataset_id=dataset.id).delete() + + session.delete(dataset) + + deleted_datasets.append(f"{dataset.name} - {dataset.id}") + + return deleted_datasets + + def post(self): + """ + Delete all data older than the given timestamp + """ + + # Start by deleting all the expired transforms + cutoff_timestamp = datetime.fromisoformat(request.args['cutoff_timestamp']) + deleted_log = 
self.delete_expired_transforms( + session=db.session, + object_store=self.object_store, + cutoff_timestamp=cutoff_timestamp) + + if deleted_log: + current_app.logger.info("Deleted expired transforms", extra={'deleted': deleted_log}) + else: + current_app.logger.info("No expired transforms found") + + # Now lets see if there are any orphaned datasets and delete them + deleted_datasets = self.delete_orphaned_datasets(db.session) + + if deleted_datasets: + current_app.logger.info("Deleted orphaned datasets", extra={'deleted': deleted_datasets}) + else: + current_app.logger.info("No orphaned datasets found") + + return {'deleted_transforms': deleted_log, 'deleted_datasets': deleted_datasets} diff --git a/servicex_app/servicex_app/resources/transformation/archive.py b/servicex_app/servicex_app/resources/transformation/delete.py similarity index 58% rename from servicex_app/servicex_app/resources/transformation/archive.py rename to servicex_app/servicex_app/resources/transformation/delete.py index 48ec58074..ebb5ebba2 100644 --- a/servicex_app/servicex_app/resources/transformation/archive.py +++ b/servicex_app/servicex_app/resources/transformation/delete.py @@ -27,43 +27,46 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from servicex_app import ObjectStoreManager from servicex_app.decorators import auth_required -from servicex_app.models import TransformRequest +from servicex_app.models import TransformRequest, TransformationResult, db from servicex_app.resources.servicex_resource import ServiceXResource from flask import current_app -class ArchiveTransform(ServiceXResource): +class DeleteTransform(ServiceXResource): @classmethod def make_api(cls, object_store_manager: ObjectStoreManager): cls.object_store = object_store_manager @auth_required def delete(self, request_id: str): - transform_req = TransformRequest.lookup(request_id) - if not transform_req: - msg = f'Transformation request not found with id: {request_id}' - current_app.logger.warning(msg, extra={'requestId': request_id}) - return {'message': msg}, 404 + session = db.session + with session.begin(): - if transform_req.archived: - msg = f'Transformation request with id: {request_id} is already archived.' - current_app.logger.warning(msg, extra={'requestId': request_id}) - return {'message': msg}, 404 + transform_req = TransformRequest.lookup(request_id) + if not transform_req: + msg = f'Transformation request not found with id: {request_id}' + current_app.logger.warning(msg, extra={'requestId': request_id}) + return {'message': msg}, 404 - if not transform_req.status.is_complete: - msg = f"Transform request with id {request_id} is still in progress." - current_app.logger.warning(msg, extra={'requestId': request_id}) - return {"message": msg}, 400 + if not transform_req.status.is_complete: + msg = f"Transform request with id {request_id} is still in progress." 
+ current_app.logger.warning(msg, extra={'requestId': request_id}) + return {"message": msg}, 400 - user = self.get_requesting_user() - if user and (not user.admin and user.id != transform_req.submitted_by): - return {"message": "You are not authorized to delete this request"}, 403 + user = self.get_requesting_user() + if user and (not user.admin and user.id != transform_req.submitted_by): + return {"message": "You are not authorized to delete this request"}, 403 - transform_req.archived = True - transform_req.save_to_db() - transform_req.truncate_results() - if self.object_store: - self.object_store.delete_bucket_and_contents(transform_req.request_id) + # Delete all the results for this transform + session.query(TransformationResult).filter_by( + request_id=transform_req.request_id).delete() + + # Delete the transformed files out of object store along with the bucket + if self.object_store: + self.object_store.delete_bucket_and_contents(transform_req.request_id) + + # Delete the transform request + session.delete(transform_req) return { "message": f"Transform request with id {request_id} has been archived." }, 200 diff --git a/servicex_app/servicex_app/resources/transformation/get_all.py b/servicex_app/servicex_app/resources/transformation/get_all.py index 619b990ec..52db08700 100644 --- a/servicex_app/servicex_app/resources/transformation/get_all.py +++ b/servicex_app/servicex_app/resources/transformation/get_all.py @@ -50,5 +50,5 @@ def get(self): transforms = TransformRequest.query.filter_by(submitted_by=query_id) else: current_app.logger.debug("Querying for all transform requests") - transforms = TransformRequest.query.filter_by(archived=False) + transforms = TransformRequest.query.all() return TransformRequest.return_json(transforms) diff --git a/servicex_app/servicex_app/resources/transformation/get_one.py b/servicex_app/servicex_app/resources/transformation/get_one.py index 917632303..7f6958b40 100644 --- a/servicex_app/servicex_app/resources/transformation/get_one.py +++ b/servicex_app/servicex_app/resources/transformation/get_one.py @@ -45,11 +45,6 @@ def get(self, request_id): current_app.logger.error(msg, extra={'requestId': request_id}) return {'message': msg}, 404 - if transform.archived: - msg = f'Transformation request with id: {request_id} is archived' - current_app.logger.error(msg, extra={'requestId': request_id}) - return {'message': msg}, 404 - transform_json = transform.to_json() if current_app.config['OBJECT_STORE_ENABLED'] and \ transform_json['result-destination'] == TransformRequest.OBJECT_STORE_DEST: diff --git a/servicex_app/servicex_app/resources/transformation/status.py b/servicex_app/servicex_app/resources/transformation/status.py index 133228a3c..329480fe4 100644 --- a/servicex_app/servicex_app/resources/transformation/status.py +++ b/servicex_app/servicex_app/resources/transformation/status.py @@ -47,11 +47,6 @@ def get(self, request_id): current_app.logger.error(msg, extra={'requestId': request_id}) return {'message': msg}, 404 - if transform.archived: - msg = f'Transformation request with id: {request_id} is archived' - current_app.logger.error(msg, extra={'requestId': request_id}) - return {'message': msg}, 404 - status_request = status_request_parser.parse_args() # Format timestamps with military timezone, given that they are in UTC. 
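
A minimal sketch of how the new internal data-lifecycle endpoint introduced in this change could be driven once deployed: the route, the `cutoff_timestamp` query parameter, and the response keys are taken from data_lifecycle_ops.py and routes.py in this diff, while the host, port, and 30-day retention window are assumptions for illustration only.

    # Sketch only: drives /servicex/internal/data-lifecycle as registered in routes.py.
    # Host/port and the 30-day cutoff are assumed, not part of this change.
    import requests
    from datetime import datetime, timedelta

    # Assumed policy: anything submitted more than 30 days ago is eligible for deletion.
    cutoff = datetime.utcnow() - timedelta(days=30)

    resp = requests.post(
        "http://localhost:5000/servicex/internal/data-lifecycle",  # assumed app address
        params={"cutoff_timestamp": cutoff.isoformat()},
    )
    resp.raise_for_status()
    body = resp.json()
    # Response keys come from DataLifecycleOps.post() in this diff.
    print("deleted transforms:", body["deleted_transforms"])
    print("deleted datasets:", body["deleted_datasets"])
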
diff --git a/servicex_app/servicex_app/resources/transformation/submit.py b/servicex_app/servicex_app/resources/transformation/submit.py index 87ddabd22..72e3585d0 100644 --- a/servicex_app/servicex_app/resources/transformation/submit.py +++ b/servicex_app/servicex_app/resources/transformation/submit.py @@ -176,7 +176,6 @@ def post(self): request_rec = TransformRequest( request_id=str(request_id), title=args.get("title"), - archived=False, did=dataset_manager.name, did_id=dataset_manager.id, submit_time=datetime.now(tz=timezone.utc), diff --git a/servicex_app/servicex_app/routes.py b/servicex_app/servicex_app/routes.py index 9192671e6..114acfe33 100644 --- a/servicex_app/servicex_app/routes.py +++ b/servicex_app/servicex_app/routes.py @@ -30,7 +30,8 @@ from servicex_app.resources.datasets.delete_dataset import DeleteDataset from servicex_app.resources.datasets.get_all import AllDatasets from servicex_app.resources.datasets.get_one import OneDataset -from servicex_app.resources.transformation.archive import ArchiveTransform +from servicex_app.resources.transformation.delete import DeleteTransform +from servicex_app.resources.internal.data_lifecycle_ops import DataLifecycleOps def add_routes(api, transformer_manager, rabbit_mq_adaptor, @@ -139,8 +140,8 @@ def add_routes(api, transformer_manager, rabbit_mq_adaptor, prefix += "/" api.add_resource(TransformationRequest, prefix) - ArchiveTransform.make_api(object_store) - api.add_resource(ArchiveTransform, prefix) + DeleteTransform.make_api(object_store) + api.add_resource(DeleteTransform, prefix) api.add_resource(TransformationStatus, prefix + "/status") @@ -165,3 +166,6 @@ def add_routes(api, transformer_manager, rabbit_mq_adaptor, TransformerFileComplete.make_api(transformer_manager) api.add_resource(TransformerFileComplete, '/servicex/internal/transformation//file-complete') + + DataLifecycleOps.make_api(object_store) + api.add_resource(DataLifecycleOps, '/servicex/internal/data-lifecycle') diff --git a/servicex_app/servicex_app_test/resource_test_base.py b/servicex_app/servicex_app_test/resource_test_base.py index b80e00985..900edd3e5 100644 --- a/servicex_app/servicex_app_test/resource_test_base.py +++ b/servicex_app/servicex_app_test/resource_test_base.py @@ -149,6 +149,7 @@ def _generate_transform_request(): transform_request.workers = 42 transform_request.workflow_name = "func_adl" transform_request.did = '123-456-789' + transform_request.did_id = 1234 transform_request.image = 'ssl-hep/foo:latest' transform_request.result_format = 'arrow' transform_request.result_destination = "object-store" @@ -163,7 +164,6 @@ def _generate_transform_request(): transform_request.transformer_language = "scala" transform_request.transformer_command = "echo" transform_request.selection = "(cool (is LISP))" - transform_request.archived = False return transform_request @staticmethod diff --git a/servicex_app/servicex_app_test/resources/internal/test_data_lifecycle_ops.py b/servicex_app/servicex_app_test/resources/internal/test_data_lifecycle_ops.py new file mode 100644 index 000000000..53ac8bb84 --- /dev/null +++ b/servicex_app/servicex_app_test/resources/internal/test_data_lifecycle_ops.py @@ -0,0 +1,268 @@ +# Copyright (c) 2019, IRIS-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from datetime import datetime +from unittest.mock import patch, ANY + +from pytest import fixture, mark + +from servicex_app.models import Dataset, TransformationResult, TransformRequest, \ + DatasetFile +from servicex_app_test.resource_test_base import ResourceTestBase +from servicex_app.resources.internal.data_lifecycle_ops import DataLifecycleOps + + +class TestDataLifecycleOps(ResourceTestBase): + module = "servicex_app.resources.internal.data_lifecycle_ops" + + @fixture(scope="function") + def db_session(self): + from sqlalchemy import create_engine + from servicex_app.models import db + engine = create_engine("sqlite:///:memory:") + db.metadata.create_all(engine) + from sqlalchemy.orm import sessionmaker + Session = sessionmaker(bind=engine) + session = Session() + + yield session + + session.close() + db.metadata.drop_all(engine) + + @fixture + def mock_db(self, mocker): + db = mocker.patch(f"{self.module}.db") + db.session = mocker.Mock() + db.session.begin = mocker.MagicMock() + return db + + @fixture + def mock_session(self, mocker): + return mocker.MagicMock() + + @fixture + def mock_find_expired(self, mocker): + mock_transform_request_cls = mocker.patch(f"{self.module}.TransformRequest") + transform = self._generate_transform_request() + mock_transform_request_cls.find_expired.return_value = [transform] + return mock_transform_request_cls.find_expired + + @fixture + def mock_find_orphaned(self, mocker, mock_db): + # Create the mock + mock_execute = mocker.Mock() + + results = [ + Dataset(last_used=datetime(2022, 1, 1), + last_updated=datetime(2022, 1, 1), + id='123', + name='dataset1', + events=100, + size=1000, + n_files=1, + lookup_status='looking', + did_finder='rucio') + ] + + # Chain the mocks to match the call pattern + mock_execute.return_value.scalars.return_value.all.return_value = results + + mock_db.session.execute = mock_execute + return results + + @fixture + def mock_transform_result(self, mocker): + mock_transform_result_cls = mocker.patch(f"{self.module}.TransformationResult") + mock_transform_result_cls.query = mocker.MagicMock() + mock_transform_result_cls.query.filter_by = mocker.MagicMock() + return mock_transform_result_cls + + @fixture + def mock_dataset_files(self, mocker): + mock_transform_result_cls = mocker.patch(f"{self.module}.DatasetFile") + 
mock_transform_result_cls.query = mocker.MagicMock() + mock_transform_result_cls.query.filter_by = mocker.MagicMock() + return mock_transform_result_cls + + @fixture + def mock_select(self, mocker): + return mocker.patch(f"{self.module}.select") + + @fixture + def mock_exists(self, mocker): + return mocker.patch(f"{self.module}.exists") + + @fixture + def insert_transforms(self, db_session): + active_transform = self._generate_transform_request() + active_transform.submit_time = datetime(2022, 1, 1, 0, 0) + active_transform.request_id = 1 + active_transform.did_id = 1 + active_transform.title = 'active' + db_session.add(active_transform) + + active_result = TransformationResult( + request_id=1, + file_path='file_path', + transform_status='complete', + ) + db_session.add(active_result) + + stale_transform = self._generate_transform_request() + stale_transform.submit_time = datetime(2021, 1, 1, 0, 0) + stale_transform.request_id = 2 + stale_transform.did_id = 1 + stale_transform.title = 'stale' + db_session.add(stale_transform) + stale_result = TransformationResult( + request_id=2, + file_path='file_path', + transform_status='complete', + ) + db_session.add(stale_result) + + db_session.commit() + return {"active": active_transform, "stale": stale_transform} + + @fixture + def insert_datasets(self, db_session, insert_transforms): + dataset = Dataset( + last_used=datetime(2022, 1, 1), + last_updated=datetime(2022, 1, 1), + id=1, + name='not-orphaned', + events=100, + size=1000, + n_files=1, + lookup_status='looking', + did_finder='rucio' + ) + db_session.add(dataset) + db_session.add(DatasetFile( + dataset_id=1, + paths='file_path' + )) + + dataset = Dataset( + last_used=datetime(2022, 1, 1), + last_updated=datetime(2022, 1, 1), + id=2, + name='orphaned', + events=100, + size=1000, + n_files=1, + lookup_status='looking', + did_finder='rucio' + ) + db_session.add(dataset) + db_session.add(DatasetFile( + dataset_id=2, + paths='file_path' + )) + + db_session.commit() + return dataset + + @mark.parametrize("use_object_store", [ + True, # Enable Object store + False # No object store + ]) + def test_expired_transforms(self, use_object_store, mocker, insert_transforms, + db_session): + + data_life_cycle_ops = DataLifecycleOps() + + mock_object_store = mocker.MagicMock() if use_object_store else None + + response = data_life_cycle_ops.delete_expired_transforms(db_session, + mock_object_store, + cutoff_timestamp=datetime + .fromisoformat('2021-01-01T00:00:00')) + + assert len(response) == 1 + remaining_transforms = db_session.query(TransformRequest).all() + assert len(remaining_transforms) == 1 + assert remaining_transforms[0].title == 'active' + + remaining_results = db_session.query(TransformationResult).all() + assert len(remaining_results) == 1 + if use_object_store: + mock_object_store.delete_bucket_and_contents.assert_called_with('2') + + def test_orphaned_datasets(self, insert_datasets, db_session): + data_life_cycle_ops = DataLifecycleOps() + response = data_life_cycle_ops.delete_orphaned_datasets(db_session) + assert len(response) == 1 + + remaining_datasets = db_session.query(Dataset).all() + assert len(remaining_datasets) == 1 + assert remaining_datasets[0].name == 'not-orphaned' + + assert len(db_session.query(DatasetFile).all()) == 1 + + @patch('servicex_app.resources.internal.data_lifecycle_ops.DataLifecycleOps.delete_expired_transforms', + return_value=['expired']) + @patch('servicex_app.resources.internal.data_lifecycle_ops.DataLifecycleOps.delete_orphaned_datasets', + 
return_value=['orphaned']) + def test_post(self, mock_orphaned, mock_delete_expired, mocker): + client = self._test_client() + with client.application.app_context(): + response = client.post('/servicex/internal/data-lifecycle', + query_string={'cutoff_timestamp': '2021-01-01T00:00:00'}) + + assert response.status_code == 200 + assert response.json == { + "deleted_transforms": ['expired'], + "deleted_datasets": ['orphaned'] + } + mock_delete_expired.assert_called_with(session=ANY, + object_store=None, + cutoff_timestamp=datetime.fromisoformat('2021-01-01T00:00:00')) + + mock_orphaned.assert_called_with(ANY) + + @patch('servicex_app.resources.internal.data_lifecycle_ops.DataLifecycleOps.delete_expired_transforms', + return_value=[]) + @patch('servicex_app.resources.internal.data_lifecycle_ops.DataLifecycleOps.delete_orphaned_datasets', + return_value=[]) + def test_post_no_op(self, mock_orphaned, mock_delete_expired, mocker): + client = self._test_client() + with client.application.app_context(): + response = client.post('/servicex/internal/data-lifecycle', + query_string={'cutoff_timestamp': '2021-01-01T00:00:00'}) + + assert response.status_code == 200 + assert response.json == { + "deleted_transforms": [], + "deleted_datasets": [] + } + mock_delete_expired.assert_called_with(session=ANY, + object_store=None, + cutoff_timestamp=datetime.fromisoformat('2021-01-01T00:00:00')) + + mock_orphaned.assert_called_with(ANY) diff --git a/servicex_app/servicex_app_test/resources/transformation/test_archive.py b/servicex_app/servicex_app_test/resources/transformation/test_delete.py similarity index 72% rename from servicex_app/servicex_app_test/resources/transformation/test_archive.py rename to servicex_app/servicex_app_test/resources/transformation/test_delete.py index 061b24843..e391cc10b 100644 --- a/servicex_app/servicex_app_test/resources/transformation/test_archive.py +++ b/servicex_app/servicex_app_test/resources/transformation/test_delete.py @@ -6,8 +6,8 @@ from servicex_app_test.resource_test_base import ResourceTestBase -class TestTransformArchive(ResourceTestBase): - module = "servicex_app.resources.transformation.archive" +class TestTransformDelete(ResourceTestBase): + module = "servicex_app.resources.transformation.delete" @pytest.fixture def mock_object_store_manager(self, mocker) -> MagicMock: @@ -17,12 +17,16 @@ def mock_object_store_manager(self, mocker) -> MagicMock: def fake_transform(self, mocker) -> TransformRequest: mock_transform_request_cls = mocker.patch(f"{self.module}.TransformRequest") transform = self._generate_transform_request() - transform.save_to_db = MagicMock() - transform.truncate_results = MagicMock() mock_transform_request_cls.lookup.return_value = transform return transform - def test_archive(self, fake_transform, mock_object_store_manager): + @pytest.fixture + def db_session(self, mocker): + mock_db = mocker.patch(f"{self.module}.db") + mock_db.session = mocker.MagicMock() + return mock_db.session + + def test_delete(self, fake_transform, db_session, mock_object_store_manager): fake_transform.status = TransformStatus.complete local_config = { @@ -37,12 +41,12 @@ def test_archive(self, fake_transform, mock_object_store_manager): resp = client.delete("/servicex/transformation/BR549") assert resp.status_code == 200 - assert fake_transform.archived - assert fake_transform.save_to_db.called - assert fake_transform.truncate_results.called + db_session.delete.assert_called_once_with(fake_transform) + 
db_session.query().filter_by.assert_called_once_with(request_id="BR549") + db_session.query().filter_by.return_value.delete.assert_called_once_with() mock_object_store_manager.delete_bucket_and_contents.assert_called_once_with("BR549") - def test_running(self, fake_transform): + def test_running(self, fake_transform, db_session): fake_transform.status = TransformStatus.running client = self._test_client() @@ -50,22 +54,8 @@ def test_running(self, fake_transform): resp = client.delete("/servicex/transformation/BR549") assert resp.status_code == 400 assert resp.json["message"] == "Transform request with id BR549 is still in progress." - assert not fake_transform.archived - assert not fake_transform.save_to_db.called - assert not fake_transform.truncate_results.called - - def test_already_archived(self, fake_transform): - fake_transform.status = TransformStatus.running - fake_transform.archived = True - - client = self._test_client() - - resp = client.delete("/servicex/transformation/BR549") - assert resp.status_code == 404 - assert resp.json["message"] == "Transformation request with id: BR549 is already archived." - assert fake_transform.archived - assert not fake_transform.save_to_db.called - assert not fake_transform.truncate_results.called + assert not db_session.query().filter_by.return_value.delete.called + assert not db_session.delete.called def test_not_found(self): client = self._test_client() @@ -83,6 +73,7 @@ def test_submit_transformation_auth_enabled(self, user_id, submitter_id, is_admin, expected_status, fake_transform, + db_session, mock_jwt_extended, mock_requesting_user): fake_transform.status = TransformStatus.complete client = self._test_client(extra_config={'ENABLE_AUTH': True}) diff --git a/servicex_app/servicex_app_test/resources/transformation/test_get_one.py b/servicex_app/servicex_app_test/resources/transformation/test_get_one.py index 1df6217a8..3d3e4df52 100644 --- a/servicex_app/servicex_app_test/resources/transformation/test_get_one.py +++ b/servicex_app/servicex_app_test/resources/transformation/test_get_one.py @@ -50,12 +50,3 @@ def test_get_single_request_404(self, mocker, client): response = client.get('/servicex/transformation/1234') assert response.status_code == 404 mock_lookup.assert_called_with('1234') - - def test_get_single_request_archived(self, mocker, client): - with patch('servicex_app.models.TransformRequest.lookup') as mock_lookup: - fake_transform_request = self._generate_transform_request() - fake_transform_request.archived = True - mock_lookup.return_value = fake_transform_request - response = client.get('/servicex/transformation/1234') - assert response.status_code == 404 - mock_lookup.assert_called_with('1234') diff --git a/servicex_app/servicex_app_test/resources/transformation/test_status.py b/servicex_app/servicex_app_test/resources/transformation/test_status.py index 3d4563fff..644b3cf56 100644 --- a/servicex_app/servicex_app_test/resources/transformation/test_status.py +++ b/servicex_app/servicex_app_test/resources/transformation/test_status.py @@ -54,16 +54,3 @@ def test_get_status_404(self, mocker, client): response = client.get('/servicex/transformation/1234/status') assert response.status_code == 404 mock_transform_request_read.assert_called_with("1234") - - def test_get_status_archived(self, mocker, client): - import servicex_app - fake_transform_request = self._generate_transform_request() - fake_transform_request.archived = True - mock_transform_request_read = mocker.patch.object( - servicex_app.models.TransformRequest, - 
'lookup', - return_value=fake_transform_request) - - response = client.get('/servicex/transformation/1234/status') - assert response.status_code == 404 - mock_transform_request_read.assert_called_with("1234") diff --git a/servicex_app/servicex_app_test/test_decorators.py b/servicex_app/servicex_app_test/test_decorators.py index 6dd80cff2..68d5ceee2 100644 --- a/servicex_app/servicex_app_test/test_decorators.py +++ b/servicex_app/servicex_app_test/test_decorators.py @@ -52,7 +52,6 @@ def test_auth_decorator_integration_auth_disabled(self, mocker, client): mock = mocker.patch('servicex_app.resources.transformation.get_one' '.TransformRequest.lookup').return_value mock.to_json.return_value = data - mock.archived = False with client.application.app_context(): response: Response = client.get(f'servicex/transformation/{fake_transform_id}') print(response.data) @@ -91,7 +90,6 @@ def test_auth_decorator_integration_authorized(self, mocker, user): mock = mocker.patch('servicex_app.resources.transformation.get_one' '.TransformRequest.lookup').return_value mock.submitted_by = user.id - mock.archived = False mock.to_json.return_value = data with client.application.app_context(): response: Response = client.get(f'servicex/transformation/{fake_transform_id}', @@ -107,7 +105,6 @@ def test_auth_decorator_integration_oauth(self, mocker, user): mock = mocker.patch('servicex_app.resources.transformation.get_one' '.TransformRequest.lookup').return_value mock.submitted_by = user.id - mock.archived = False mock.to_json.return_value = data with client.session_transaction() as sess: sess['is_authenticated'] = True
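
As a footnote to the delete_orphaned_datasets() implementation added above: the query it builds is a plain correlated EXISTS anti-join, selecting every Dataset row that no TransformRequest references through did_id. Below is a self-contained sketch of that same pattern with throwaway models standing in for servicex_app.models; the table names and columns here are simplified placeholders, not the real schema.

    # Illustrative sketch of the ~exists() anti-join used in delete_orphaned_datasets().
    from sqlalchemy import create_engine, select, exists, Column, Integer, String, ForeignKey
    from sqlalchemy.orm import declarative_base, Session

    Base = declarative_base()

    class Dataset(Base):
        __tablename__ = "datasets"          # simplified stand-in for the real model
        id = Column(Integer, primary_key=True)
        name = Column(String)

    class TransformRequest(Base):
        __tablename__ = "requests"          # simplified stand-in for the real model
        id = Column(Integer, primary_key=True)
        did_id = Column(Integer, ForeignKey("datasets.id"))

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add_all([Dataset(id=1, name="in-use"),
                         Dataset(id=2, name="orphaned"),
                         TransformRequest(id=10, did_id=1)])
        session.commit()

        # A dataset is orphaned when no transform request points at it.
        referenced = select(1).where(TransformRequest.did_id == Dataset.id)
        orphans = session.execute(select(Dataset).where(~exists(referenced))).scalars().all()
        print([d.name for d in orphans])  # -> ['orphaned']
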