From 5491e5c3dbf7a7c4c23e65ef2309a5879ba4684b Mon Sep 17 00:00:00 2001 From: David Benedeki <14905969+benedeki@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:58:19 +0100 Subject: [PATCH] #291: Project improvements after 0.3.0 release (#300) * set credentials persistence to `false` in GitHub checkout actions * fixed release notes presence check GitHub workflow (using new action for it) * added supported Atum Agent control functions list to documentation * added new grouping of issues into release notes draft * added badges to `README.md` * CODEOWNERS update * ignoring `AgentServerCompatibilityTests` * conditional load of some modules based on Java version * run tests on all Scala versions --------- Co-authored-by: Ladislav Sulak --- .github/CODEOWNERS | 2 +- .github/workflows/build.yml | 20 ++-- .github/workflows/format_check.yml | 3 +- .github/workflows/jacoco_report.yml | 2 + .github/workflows/license_check.yml | 4 +- .../pr_release_note_comment_check.yml | 94 ------------------- .../release-notes-presence-check.yml | 44 +++++++++ .github/workflows/release_draft.yml | 11 ++- .github/workflows/release_publish.yml | 2 + .github/workflows/test_filenames_check.yml | 4 +- README.md | 75 +++++++++++++++ .../agent/AgentServerCompatibilityTests.scala | 2 +- build.sbt | 85 ++++++++++------- project/Setup.scala | 5 + 14 files changed, 214 insertions(+), 139 deletions(-) delete mode 100644 .github/workflows/pr_release_note_comment_check.yml create mode 100644 .github/workflows/release-notes-presence-check.yml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index cec71281c..ab470496d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @benedeki @lsulak @TebaleloS @Zejnilovic @dk1844 @salamonpavel +* @benedeki @lsulak @Zejnilovic @dk1844 @salamonpavel @ABLL526 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 31ee0e50b..849bd9e0c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,7 +30,8 @@ 
jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - uses: coursier/cache-action@v5 - name: Setup Scala @@ -38,11 +39,11 @@ jobs: with: java-version: "adopt@1.8" - - name: Build and run unit tests - run: sbt "project model" test doc "project reader" test doc "project agent_spark3" test doc + - name: Build and run tests + run: sbt testAll - - name: Build and run integration tests - run: sbt "project model" testIT "project reader" testIT "project agent_spark3" testIT + - name: Generate documentation + run: sbt doc test-database-and-server: name: Test Database and Server @@ -64,7 +65,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - uses: coursier/cache-action@v5 - name: Setup Scala @@ -73,10 +75,14 @@ jobs: java-version: "adopt@1.11.0-11" - name: Build and run unit tests - run: sbt "project database" test doc "project server" test doc + run: sbt "project database" test "project server" test - name: Prepare testing database run: sbt flywayMigrate - name: Build and run integration tests run: sbt "project database" testIT "project server" testIT + + - name: Generate documentation + run: sbt "project database" doc "project server" doc + diff --git a/.github/workflows/format_check.yml b/.github/workflows/format_check.yml index a93ce1783..12090ccfe 100644 --- a/.github/workflows/format_check.yml +++ b/.github/workflows/format_check.yml @@ -27,8 +27,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 ref: ${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/jacoco_report.yml b/.github/workflows/jacoco_report.yml index 80f08b2f2..0f3157b95 100644 --- a/.github/workflows/jacoco_report.yml +++ b/.github/workflows/jacoco_report.yml @@ -50,6 +50,8 @@ jobs: steps: - name: Checkout code uses: 
actions/checkout@v4 + with: + persist-credentials: false - name: Setup Scala uses: olafurpg/setup-scala@v14 with: diff --git a/.github/workflows/license_check.yml b/.github/workflows/license_check.yml index 36a4f4d5f..3113d4886 100644 --- a/.github/workflows/license_check.yml +++ b/.github/workflows/license_check.yml @@ -27,7 +27,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 + with: + persist-credentials: false - name: Setup Scala uses: olafurpg/setup-scala@v10 with: diff --git a/.github/workflows/pr_release_note_comment_check.yml b/.github/workflows/pr_release_note_comment_check.yml deleted file mode 100644 index 4dc08f526..000000000 --- a/.github/workflows/pr_release_note_comment_check.yml +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright 2021 ABSA Group Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -name: PR Release Note Comment Check - -on: - issue_comment: - types: - - created - - edited - - deleted - pull_request: - types: - - opened - - synchronize - - reopened - - edited - - labeled - - unlabeled - branches: [ master ] - -jobs: - check-for-release-notes-comments: - if: ${{ ( github.event_name == 'pull_request') || (github.event.issue.pull_request) }} - name: Check For Release Notes Comments - runs-on: ubuntu-latest - steps: - - name: Get PR branch - uses: xt0rted/pull-request-comment-branch@v1 - id: comment-branch - - - name: Set latest commit status as pending - uses: myrotvorets/set-commit-status-action@master - with: - sha: ${{ steps.comment-branch.outputs.head_sha }} - token: ${{ secrets.GITHUB_TOKEN }} - status: pending - - - name: Fetch all PR comments - if: ${{ ! contains( github.event.pull_request.labels.*.name, 'no RN') }} - id: get-comments - uses: actions/github-script@v7 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const issueNumber = context.issue.number; - const repoName = context.repo.repo; - const repoOwner = context.repo.owner; - - const comments = await github.rest.issues.listComments({ - owner: repoOwner, - repo: repoName, - issue_number: issueNumber, - }); - - return comments.data.map(comment => comment.body); - - - name: Check for 'Release Notes' in comments - if: ${{ ! 
contains( github.event.pull_request.labels.*.name, 'no RN') }} - uses: actions/github-script@v7 - with: - script: | - const comments = ${{ steps.get-comments.outputs.result }}; - console.log("Comments:"); - console.log(comments); - const releaseNotesRegex = /release notes?:?/i; - const hasReleaseNotes = comments.some(comment => releaseNotesRegex.test(comment)); - - if (!hasReleaseNotes) { - console.log('No "Release notes" found in PR comments'); - core.setFailed('No "Release notes" found in PR comments') - } else { - console.log('"Release notes" found in comments'); - } - - name: Set latest commit status as ${{ job.status }} - uses: myrotvorets/set-commit-status-action@master - if: always() - with: - sha: ${{ steps.comment-branch.outputs.head_sha }} - token: ${{ secrets.GITHUB_TOKEN }} - status: ${{ job.status }} diff --git a/.github/workflows/release-notes-presence-check.yml b/.github/workflows/release-notes-presence-check.yml new file mode 100644 index 000000000..f60618756 --- /dev/null +++ b/.github/workflows/release-notes-presence-check.yml @@ -0,0 +1,44 @@ +# +# Copyright 2021 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Release Notes Presence Check + +on: + pull_request: + types: [opened, synchronize, reopened, edited, labeled, unlabeled] + branches: [ master ] + +env: + SKIP_LABEL: 'no RN' + RLS_NOTES_TAG_REGEX: 'Release Notes:' + +jobs: + release-notes-presence-check: + name: Release Notes Presence Check + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v5.1.1 + with: + python-version: '3.11' + + - name: Check presence of release notes in PR description + uses: AbsaOSS/release-notes-presence-check@v0.1.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + github-repository: ${{ github.repository }} + pr-number: ${{ github.event.number }} diff --git a/.github/workflows/release_draft.yml b/.github/workflows/release_draft.yml index aa303469c..95055bfa1 100644 --- a/.github/workflows/release_draft.yml +++ b/.github/workflows/release_draft.yml @@ -28,6 +28,7 @@ jobs: steps: - uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 # the following step is disabled because it doesn't order the version tags correctly # - name: Validate format of received tag @@ -104,6 +105,7 @@ jobs: steps: - uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 ref: refs/tags/${{ github.event.inputs.tagName }} @@ -119,10 +121,17 @@ jobs: with: tag-name: ${{ github.event.inputs.tagName }} chapters: '[ + {"title": "No entry 🚫", "label": "duplicate"}, + {"title": "No entry 🚫", "label": "invalid"}, + {"title": "No entry 🚫", "label": "wontfix"}, + {"title": "No entry 🚫", "label": "no RN"}, {"title": "Breaking Changes 💥", "label": "breaking-change"}, {"title": "New Features 🎉", "label": "enhancement"}, {"title": "New Features 🎉", "label": "feature"}, - {"title": "Bugfixes 🛠", "label": "bug"} + {"title": "Bugfixes 🛠", "label": "bug"}, + {"title": "Infrastructure ⚙️", "label": "infrastructure"}, + {"title": "Silent-live 🤫", "label": "silent-live"}, + {"title": "Documentation 📜", "label": "documentation"} ]' duplicity-scope: 
'service' duplicity-icon: '🔁' diff --git a/.github/workflows/release_publish.yml b/.github/workflows/release_publish.yml index b349a8ff6..3a68d8cf2 100644 --- a/.github/workflows/release_publish.yml +++ b/.github/workflows/release_publish.yml @@ -27,6 +27,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 - uses: coursier/cache-action@v5 @@ -51,6 +52,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 - uses: coursier/cache-action@v5 diff --git a/.github/workflows/test_filenames_check.yml b/.github/workflows/test_filenames_check.yml index ed0a061af..03d509cd1 100644 --- a/.github/workflows/test_filenames_check.yml +++ b/.github/workflows/test_filenames_check.yml @@ -27,7 +27,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 + with: + persist-credentials: false - name: Filename Inspector id: scan-test-files diff --git a/README.md b/README.md index 46dfc975b..5703b5954 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,19 @@ # Atum Service +[![Build](https://github.com/AbsaOSS/atum-service/actions/workflows/build.yml/badge.svg)](https://github.com/AbsaOSS/atum-service/actions/workflows/build.yml) +[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) +[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/AbsaOSS/atum-service/graphs/commit-activity) + +| Atum Server | Atum Agent | Atum Model | Atum Reader | 
+|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [![GitHub release](https://img.shields.io/github/release/AbsaOSS/atum-service.svg)](https://GitHub.com/AbsaOSS/atum-service/releases/) | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa.atum-service/atum-agent-spark3_2.13/badge.svg)](https://central.sonatype.com/search?q=atum-agent&namespace=za.co.absa.atum-service) | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa.atum-service/atum-model_2.13/badge.svg)](https://central.sonatype.com/search?q=atum-model&namespace=za.co.absa.atum-service) | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa.atum-service/atum-reader_2.13/badge.svg)](https://central.sonatype.com/search?q=atum-reader&namespace=za.co.absa.atum-service) | + + + + - [Atum Service](#atum-service) + - [Motivation](#motivation) + - [Features](#features) - [Modules](#modules) - [Agent `agent/`](#agent-agent) - [Reader `reader/`](#agent-agent) @@ -15,6 +28,9 @@ - [Measurement](#measurement) - [Checkpoint](#checkpoint) - [Data Flow](#data-flow) + - [Usage](#usage) + - [Atum Agent routines](#atum-agent-routines) + - [Control measurement types](#control-measurement-types) - [How to generate Code coverage report](#how-to-generate-code-coverage-report) - [How to Run in IntelliJ](#how-to-run-in-intellij) - [How to Run Tests](#how-to-run-tests) @@ 
-41,6 +57,39 @@ functions and are stored on a single central place, in a relational database. Co checkpoints is not only helpful for complying with strict regulatory frameworks, but also helps during development and debugging of your Spark-based data processing. +## Motivation + +Big Data strategy for a company usually includes data gathering and ingestion processes. +That is the definition of how data from different systems operating inside a company +are gathered and stored for further analysis and reporting. An ingestion process can involve +various transformations like: +* Converting between data formats (XML, CSV, etc.) +* Data type casting, for example converting XML strings to numeric values +* Joining reference tables. For example this can include enriching existing + data with additional information available through dictionary mappings. + This constitutes a common ETL (Extract, Transform and Load) process. + +During such transformations, sometimes data can get corrupted (e.g. during casting), records can +get added or lost. For instance, *outer joining* a table holding duplicate keys can result in records explosion. +And *inner joining* a table which has no matching keys for some records will result in loss of records. + +In regulated industries it is crucial to ensure data integrity and accuracy. For instance, in the banking industry +the BCBS set of regulations requires analysis and reporting to be based on data accuracy and integrity principles. +Thus it is critical at the ingestion stage to preserve the accuracy and integrity of the data gathered from a +source system. + +The purpose of Atum is to provide means of ensuring no critical fields have been modified during the processing and no +records are added or lost. To do this the library provides an ability to calculate *control numbers* of explicitly +specified columns using a selection of aggregate functions. 
We call the set of such measurements at a given time +a *checkpoint* and each value - a result of the function computation - we call a *control measurement*. Checkpoints can +be calculated anytime between Spark transformations and actions, as well as at the start of the process or after its end. + +We assume the data for ETL are processed in a series of batch jobs. Let's call each data set for a given batch +job a *batch*. All checkpoints are calculated for a specific batch. + +## Features + +TBD ## Modules @@ -157,6 +206,32 @@ The journey of a dataset throughout various data transformations and pipelines. even if it involves multiple applications or ETL pipelines. +## Usage + +### Atum Agent routines + +TBD + +### Control measurement types + +The control measurement of one or more columns is an aggregation function result executed over the dataset. It can be +calculated differently depending on the column's data type, on business requirements and function used. This table +represents all currently supported measurement types (aka measures): + +| Type | Description | +|------------------------------------|:--------------------------------------------------------------| +| AtumMeasure.RecordCount | Calculates the number of rows in the dataset | +| AtumMeasure.DistinctRecordCount | Calculates COUNT(DISTINCT()) of the specified column | +| AtumMeasure.SumOfValuesOfColumn | Calculates SUM() of the specified column | +| AtumMeasure.AbsSumOfValuesOfColumn | Calculates SUM(ABS()) of the specified column | +| AtumMeasure.SumOfHashesOfColumn | Calculates SUM(CRC32()) of the specified column | +| Measure.UnknownMeasure | Custom measure where the data are provided by the application | + +[//]: # (| controlType.aggregatedTruncTotal | Calculates SUM(TRUNC()) of the specified column |) + +[//]: # (| controlType.absAggregatedTruncTotal | Calculates SUM(TRUNC(ABS())) of the specified column |) + + ## How to generate Code coverage report ```sbt sbt jacoco diff --git 
a/agent/src/test/scala/za/co/absa/atum/agent/AgentServerCompatibilityTests.scala b/agent/src/test/scala/za/co/absa/atum/agent/AgentServerCompatibilityTests.scala index 992aabe12..d720100f1 100644 --- a/agent/src/test/scala/za/co/absa/atum/agent/AgentServerCompatibilityTests.scala +++ b/agent/src/test/scala/za/co/absa/atum/agent/AgentServerCompatibilityTests.scala @@ -40,7 +40,7 @@ class AgentServerCompatibilityTests extends DBTestSuite { .add(StructField("columnForSum", DoubleType)) // Need to add service & pg run in CI - test("Agent should be compatible with server") { + ignore("Agent should be compatible with server") { val expectedMeasurement = JsonBString( """{"mainValue": {"value": "4", "valueType": "Long"}, "supportValues": {}}""".stripMargin diff --git a/build.sbt b/build.sbt index 0c2f6b1ee..839e11b56 100644 --- a/build.sbt +++ b/build.sbt @@ -20,19 +20,25 @@ import Dependencies.* import Dependencies.Versions.spark3 import VersionAxes.* -ThisBuild / scalaVersion := Setup.scala213.asString // default version TODO +ThisBuild / scalaVersion := Setup.scala213.asString ThisBuild / versionScheme := Some("early-semver") Global / onChangedBuildSource := ReloadOnSourceChanges +val limitedProject: Boolean = Setup.currentJava < Setup.recommendedJava + initialize := { val _ = initialize.value // Ensure previous initializations are run - val requiredJavaVersion = VersionNumber("11") - val currentJavaVersion = VersionNumber(sys.props("java.specification.version")) - println(s"Running on Java version $currentJavaVersion, required is at least version $requiredJavaVersion") - //this routine can be used to assert the required Java version + assert(Setup.currentJava >= Setup.requiredJava, + s"Running on Java version ${Setup.currentJava}, required is at least version ${Setup.requiredJava}, recommended is ${Setup.recommendedJava}") + + if (limitedProject) { + val log = Keys.sLog.value + log.warn(s"Some modules will not be loaded, because they require at least Java 
${Setup.recommendedJava} while Java ${Setup.currentJava} has been found") + log.warn("""Affected modules are: "atum-server", "atum-database"""") + } } enablePlugins(FlywayPlugin) @@ -47,23 +53,31 @@ libraryDependencies ++= flywayDependencies /** * Module `server` is the service application that collects and stores measured data And upo request retrives them */ -lazy val server = (projectMatrix in file("server")) - .settings( - Setup.commonSettings ++ Seq( - name := "atum-server", - javacOptions ++= Setup.serverAndDbJavacOptions, - Compile / packageBin / publishArtifact := false, - packageBin := (Compile / assembly).value, - artifactPath / (Compile / packageBin) := baseDirectory.value / s"target/${name.value}-${version.value}.jar", - testFrameworks += new TestFramework("zio.test.sbt.ZTestFramework"), - Setup.serverMergeStrategy, - publish / skip := true - ): _* - ) - .enablePlugins(AssemblyPlugin) - .enablePlugins(AutomateHeaderPlugin) - .addSingleScalaBuild(Setup.serverAndDbScalaVersion, Dependencies.serverDependencies) - .dependsOn(model) +lazy val server = { + val server = (projectMatrix in file("server")) + .settings( + Setup.commonSettings ++ Seq( + name := "atum-server", + javacOptions ++= Setup.serverAndDbJavacOptions, + Compile / packageBin / publishArtifact := false, + packageBin := (Compile / assembly).value, + artifactPath / (Compile / packageBin) := baseDirectory.value / s"target/${name.value}-${version.value}.jar", + testFrameworks += new TestFramework("zio.test.sbt.ZTestFramework"), + Setup.serverMergeStrategy, + publish / skip := true + ): _* + ) + .enablePlugins(AssemblyPlugin) + .enablePlugins(AutomateHeaderPlugin) + .addSingleScalaBuild(Setup.serverAndDbScalaVersion, Dependencies.serverDependencies) + .dependsOn(model) + + if (limitedProject) { + null // if value other than null is returned, the condition doesn't seem to work. 
+ } else { + server + } +} /** * Module `agent` is the library to be plugged into the Spark application to measure the data and send it to the server @@ -95,16 +109,23 @@ lazy val model = (projectMatrix in file("model")) /** * Module `database` is the source of database structures of the service */ -lazy val database = (projectMatrix in file("database")) - .disablePlugins(sbtassembly.AssemblyPlugin) - .settings( - Setup.commonSettings ++ Seq( - name := "atum-database", - javacOptions ++= Setup.serverAndDbJavacOptions, - publish / skip := true - ): _* - ) - .addSingleScalaBuild(Setup.serverAndDbScalaVersion, Dependencies.databaseDependencies) +lazy val database = { + val database = (projectMatrix in file("database")) + .disablePlugins(sbtassembly.AssemblyPlugin) + .settings( + Setup.commonSettings ++ Seq( + name := "atum-database", + javacOptions ++= Setup.serverAndDbJavacOptions, + publish / skip := true + ): _* + ) + .addSingleScalaBuild(Setup.serverAndDbScalaVersion, Dependencies.databaseDependencies) + if (limitedProject) { + null // if value other than null is returned, the condition doesn't seem to work. + } else { + database + } +} /** * Module `reader` is the library to be plugged into application which wants to easily read the measured data stored on diff --git a/project/Setup.scala b/project/Setup.scala index 14c3f8927..319a06bcd 100644 --- a/project/Setup.scala +++ b/project/Setup.scala @@ -24,6 +24,11 @@ import za.co.absa.commons.version.Version object Setup { + //supported Java versions + val requiredJava: Double = "1.8".toDouble + val recommendedJava: Double = "11".toDouble + val currentJava: Double = sys.props("java.specification.version").toDouble + //supported Scala versions val scala211: Version = Version.asSemVer("2.11.12") val scala212: Version = Version.asSemVer("2.12.18")