diff --git a/.github/workflows/generate_and_check_licenses.py b/.github/workflows/generate_and_check_licenses.py
deleted file mode 100644
index f5478cf..0000000
--- a/.github/workflows/generate_and_check_licenses.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""
-generate_and_check_licenses.py
-Utility script for creating the license/LICENSE-3RD-PARTY.md file and reporting if there are any disallowed licenses.
-Throws an exception if a disallowed license is found and creates an additional .md file with the disallowed licenses listed.
-"""
-
-from io import StringIO
-import sys
-import os
-import pandas as pd
-
-# We can define a subset of licenses we will permit.
-ALLOWED_LICENSES = [
-    "Apache-2.0",
-    "MIT",
-    "ISC",
-    "BSD-1-Clause",
-    "BSD-2-Clause",
-    "BSD-3-Clause",  # Note that we have not allowed for 4-clause.
-    "MPL-2.0",
-]
-
-buff = StringIO("")
-
-for l in sys.stdin:
-    buff.write(l)
-
-buff.seek(0)
-
-# Read the output into a CSV.
-df = pd.read_csv(buff, sep=",", header=None, names=["Module", "License Path", "License"])
-
-# Read the go.mod file to get version and indirect information
-go_mod_file = 'go.mod'
-with open(go_mod_file, 'r') as f:
-    go_mod_content = f.readlines()
-
-# Parse go.mod file to extract versions and indirect status
-module_versions = {}
-for line in go_mod_content:
-    line = line.strip()
-
-    if line:
-        parts = line.split()
-        if parts[0] == "module" or parts[0] == "go" or parts[0] == ")":
-            continue
-        if parts[0] == "require":
-            if parts[1] == "(":
-                continue
-            else:
-                module_name = parts[1]
-                module_version = parts[2]
-        else:
-            module_name = parts[0]
-            module_version = parts[1]
-
-        if "// indirect" in line:
-            module_version += " (indirect)"
-        module_versions[module_name] = module_version
-
-
-# Add version and indirect status to the dataframe
-df["Module Version"] = df["Module"].map(module_versions)
-
-# Combine module names with versions, excluding 'nan'
-df["Module"] = df.apply(
-    lambda x: f'{x["Module"]} {x["Module Version"]}' if pd.notna(x["Module Version"]) else x["Module"],
-    axis=1
-)
-
-# Filter out any Dataphos-repository licenses
-df = df[~df["Module"].str.startswith("github.com/dataphos")]
-
-if df.empty:
-    print("No third party licenses found.")
-    sys.exit()
-
-# Keep only the rows with compliant licenses.
-df_allowed = df[df["License"].str.contains("|".join(ALLOWED_LICENSES))]
-
-# Create another dataframe for disallowed licenses.
-df_disallowed = df[~df["License"].str.contains("|".join(ALLOWED_LICENSES))]
-
-# Drop the "License Path" and "Module Version" columns from both dataframes
-df_allowed = df_allowed.drop(columns=["License Path", "Module Version"])
-df_disallowed = df_disallowed.drop(columns=["License Path", "Module Version"])
-
-# Convert the allowed dataframe to markdown
-allowed_markdown = df_allowed.to_markdown(index=False)
-
-# Ensure the output directory exists
-output_dir = "licenses"
-os.makedirs(output_dir, exist_ok=True)
-
-# Write the allowed licenses markdown to a .md file
-allowed_output_file = os.path.join(output_dir, "LICENSE-3RD-PARTY.md")
-with open(allowed_output_file, 'w') as f:
-    f.write(allowed_markdown)
-
-# Display the result.
-# If the dataframe isn't empty, the check fails.
-if len(df_disallowed.index) > 0:
-    disallowed_markdown = df_disallowed.to_markdown(index=False)
-    disallowed_output_file = os.path.join(output_dir, "disallowed_licenses.md")
-    with open(disallowed_output_file, 'w') as f:
-        f.write(disallowed_markdown)
-    print(df[["Module", "License"]])
-    raise Exception("Found one or more disallowed licenses!
Please review dependencies!") - -print("No worrisome dependencies found.") diff --git a/.github/workflows/libs/pr.yaml b/.github/workflows/libs/pr.yaml deleted file mode 100644 index 17e8474..0000000 --- a/.github/workflows/libs/pr.yaml +++ /dev/null @@ -1,198 +0,0 @@ -name: PR CI - -on: - pull_request: - branches: [ develop, main ] - -env: - GO111MODULE: on - GO_VERSION: 1.19 - NODE_VERSION: 22 - LINT_ARGS: -v --timeout 5m0s --out-${NO_FUTURE}format colored-line-number - TEST_ARGS: -v -short -coverprofile=coverage.out - TEST_PATH: ./... - -jobs: - commitlint: - name: Commit Lint Job - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - - - name: Install commitlint - run: | - npm install --save-dev @commitlint/{cli,config-conventional} - - - name: Validate PR commits with commitlint - run: npx commitlint --from ${{ github.event.pull_request.head.sha }}~${{ github.event.pull_request.commits }} --to ${{ github.event.pull_request.head.sha }} --verbose - - editor_config_job: - name: Editor Config Job - runs-on: ubuntu-latest - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - - - name: Editor Config - run: | - npm install --save-dev editorconfig-checker - ./node_modules/.bin/editorconfig-checker - - lint_job: - name: Go Lint Job - if: ${{ ! contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - steps: - - name: Check out code into the Go module directory - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v3 - with: - go-version: ${{ env.GO_VERSION }} - - - name: Tidy - run: go mod tidy - - - name: Go Lint - uses: golangci/golangci-lint-action@v3 - with: - version: v1.50.0 - args: ${{ env.LINT_ARGS }} - skip-pkg-cache: true - skip-build-cache: true - - licenses_check: - name: 3rd Party Licenses Check - if: ${{ github.event.head_commit.committer.name != 'github-actions[bot]' || ! 
contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Retrieve PR and branch info - run: | - PR_TITLE="chore: update 3rd-party licenses (#${{ github.event.number }})" - - PR_INFO=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - "https://api.github.com/repos/${{ github.repository }}/pulls?state=open" | \ - jq --arg TITLE "$PR_TITLE" '.[] | select(.title == $TITLE) | { number: .number, head: .head.ref }') - - echo "PR_INFO=$PR_INFO" - - PR_NUMBER=$(echo "$PR_INFO" | jq -r .number) - BRANCH_NAME=$(echo "$PR_INFO" | jq -r .head) - - echo "PR_TITLE=$PR_TITLE" >> $GITHUB_ENV - echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV - echo "BRANCH_NAME=${BRANCH_NAME:-update-third-party-licenses-${{ github.run_id }}}" >> $GITHUB_ENV - echo "PARENT_BRANCH=${{ github.head_ref }}" >> $GITHUB_ENV - - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - # if PR already exists - - name: Pull latest changes to existing branch - if: env.PR_NUMBER != '' - run: | - git fetch origin - git switch ${{ env.BRANCH_NAME }} - git pull origin ${{ env.PARENT_BRANCH }} --no-rebase - - - name: Set up Go - uses: actions/setup-go@v4 - with: - go-version: ${{ env.GO_VERSION }} - - - name: Tidy - run: go mod tidy - - - name: Vendor - run: go mod vendor - - - name: Install Go licenses - run: go install github.com/google/go-licenses@v1.4.0 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.9' - cache: 'pip' - cache-dependency-path: '.github/workflows/requirements.txt' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -r .github/workflows/requirements.txt - - - name: Run license check - run: | - go-licenses report ./... 2>/dev/null | python .github/workflows/generate_and_check_licenses.py - - - name: Check and Commit changes - run: | - if [ -d "./licenses" ]; then - git add ./licenses - fi - - if ! git diff-index --quiet HEAD; then - git commit -m "chore: update third party licenses" - echo "changes_committed=true" >> $GITHUB_ENV - else - echo "changes_committed=false" >> $GITHUB_ENV - fi - - # This will fail if the incorrect go.mod or go.sum is committed - - name: Push changes - if: env.changes_committed == 'true' - run: | - git diff - - if [[ -z "$PR_NUMBER" ]]; then - git switch -c ${{ env.BRANCH_NAME }} - fi - git push origin HEAD - - - name: Create new PR - if: env.changes_committed == 'true' && env.PR_NUMBER == '' - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - echo "Creating a new PR..." - gh pr create --base "${{ env.PARENT_BRANCH }}" --head "update-third-party-licenses-${{ github.run_id }}" --title "${{ env.PR_TITLE }}" --body "This is an automated PR that updates the list of 3rd party licenses." - - test_job: - name: Test Job - if: ${{ github.base_ref == 'main' && ! 
contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - steps: - - name: Check out code into the Go module directory - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Go - uses: actions/setup-go@v3 - with: - go-version: ${{ env.GO_VERSION }} - - - name: Tidy - run: go mod tidy - - - name: Go Test - run: go test ${{ env.TEST_ARGS }} ${{ env.TEST_PATH }} diff --git a/.github/workflows/libs/push.yaml b/.github/workflows/libs/push.yaml deleted file mode 100644 index 6c8f3a4..0000000 --- a/.github/workflows/libs/push.yaml +++ /dev/null @@ -1,90 +0,0 @@ -name: PUSH CI - -on: - push: - branches: [ develop, main ] - -env: - GO_VERSION: 1.21 - -jobs: - commitlint: - name: Commit Lint Job - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '22' - - - name: Install commitlint - run: | - npm install --save-dev @commitlint/{cli,config-conventional} - - - name: Validate current commit (last commit) with commitlint - run: npx commitlint --last --verbose - - license_headers: - name: Add License Headers - if: github.event.head_commit.committer.name != 'github-actions[bot]' - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Go environment - uses: actions/setup-go@v4 - with: - go-version: ${{ env.GO_VERSION }} - - - name: Install NWA tool - run: go install github.com/B1NARY-GR0UP/nwa@latest - - - name: Add missing license headers - run: nwa add -c "Syntio Ltd." # WRITE FOLDER PATHS FOR ALL FOLDERS THAT CONTAIN FILES THAT REQUIRE HEADERS eg. ./persistor - - - name: Check and Commit changes - id: check_commit - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add . - if ! git diff-index --quiet HEAD; then - git commit -m "style: add license headers" - echo "changes_committed=true" >> $GITHUB_ENV - else - echo "changes_committed=false" >> $GITHUB_ENV - echo "All necessary headers present." - fi - - - name: Create a new branch for the PR - if: env.changes_committed == 'true' - run: | - git checkout -b "add-license-headers-${{ github.run_id }}" - git push origin HEAD - - - name: Create Pull Request - if: env.changes_committed == 'true' - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh pr create --base ${{ github.ref_name }} --head "add-license-headers-${{ github.run_id }}" --title "style: add license headers" --body "This PR adds license headers to the affected files. 
Recommendation: Merge this PR using the rebase-merge method"
-
-  release-please:
-    if: github.ref_name == 'main'
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      pull-requests: write
-    steps:
-      - uses: googleapis/release-please-action@v4
-        with:
-          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
-          release-type: simple
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
new file mode 100644
index 0000000..01209d1
--- /dev/null
+++ b/.github/workflows/pr.yaml
@@ -0,0 +1,45 @@
+name: PR CI
+
+on:
+  pull_request:
+    branches: [ develop, main ]
+
+env:
+  NODE_VERSION: 22
+
+jobs:
+  commitlint:
+    name: Commit Lint Job
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ env.NODE_VERSION }}
+
+      - name: Install commitlint
+        run: |
+          npm install --save-dev @commitlint/{cli,config-conventional}
+      - name: Validate PR commits with commitlint
+        run: npx commitlint --from ${{ github.event.pull_request.head.sha }}~${{ github.event.pull_request.commits }} --to ${{ github.event.pull_request.head.sha }} --verbose
+
+  editor_config_job:
+    name: Editor Config Job
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ env.NODE_VERSION }}
+
+      - name: Editor Config
+        run: |
+          npm install --save-dev editorconfig-checker
+          ./node_modules/.bin/editorconfig-checker
\ No newline at end of file
diff --git a/.github/workflows/products/pr.yaml b/.github/workflows/products/pr.yaml
deleted file mode 100644
index 7b439b5..0000000
--- a/.github/workflows/products/pr.yaml
+++ /dev/null
@@ -1,285 +0,0 @@
-name: PR CI
-
-on:
-  pull_request:
-    branches: [ develop, main ]
-
-env:
-  GO111MODULE: on
-  GO_VERSION: 1.19
-  NODE_VERSION: 22
-  LINT_ARGS: -v --skip-files .*_test.go --timeout 5m0s --out-${NO_FUTURE}format colored-line-number
-  GOLANGCI_LINT_VERSION: v1.50
-  TEST_ARGS: -v -short -coverprofile=coverage.out
-
-jobs:
-  commitlint:
-    name: Commit Lint Job
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: ${{ env.NODE_VERSION }}
-
-      - name: Install commitlint
-        run: |
-          npm install --save-dev @commitlint/{cli,config-conventional}
-
-      - name: Validate PR commits with commitlint
-        run: npx commitlint --from ${{ github.event.pull_request.head.sha }}~${{ github.event.pull_request.commits }} --to ${{ github.event.pull_request.head.sha }} --verbose
-
-  # Linting multiple Dockerfiles to ensure adherence to best practices and coding standards.
-  hadolint_job:
-    name: Hadolint Job
-    if: ${{ ! contains(github.head_ref, 'release-please--branches--main') }}
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        dockerfile: # Add dockerfile paths eg.
'./persistor/docker/persistor/Dockerfile' - - - - steps: - - name: Check out code - uses: actions/checkout@v4 - - - name: Run Hadolint for ${{ matrix.dockerfile }} - uses: hadolint/hadolint-action@v2.0.0 - with: - config: ./.hadolint.yaml - dockerfile: ${{ matrix.dockerfile }} - - editor_config_job: - name: Editor Config Job - runs-on: ubuntu-latest - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - - - name: Editor Config - run: | - npm install --save-dev editorconfig-checker - ./node_modules/.bin/editorconfig-checker - - # Ensures that java and python code adhere to coding styles and conventions - java_and_python_lint_job: - name: Java and Python lint - if: ${{ ! contains(github.head_ref, 'release-please--branches--main') }} - uses: github/super-linter@v4 - env: - GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} - VALIDATE_PYTHON_BLACK: true - VALIDATE_JAVA: true - - # Ensures that the code adheres to the lint checks defined in .golangci.yaml. - lint_job: - name: Go lint job for all components - if: ${{ ! contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - strategy: - matrix: - component: - - # Add file paths eg. './persistor - steps: - - name: Check out code into the Go module directory - uses: actions/checkout@v4 - - # Using `tj-actions/changed-files` to check if any files in the component folder have changes - - name: Check if component folder has changed - id: check_changed_files - uses: tj-actions/changed-files@v45 - with: - files: ${{ matrix.component }}/** - - # Running lint steps if changes are detected in the component folder - - name: Set up Go - if: steps.check_changed_files.outputs.any_changed == 'true' - uses: actions/setup-go@v3 - with: - go-version: ${{ env.GO_VERSION }} - # Add all component folders for monorepos - cache-dependency-path: | - /go.sum - - - name: Tidy Go mod for ${{ matrix.component }} - if: steps.check_changed_files.outputs.any_changed == 'true' - working-directory: ${{ matrix.component }} - run: go mod tidy - - - name: Run Go Lint for ${{ matrix.component }} - if: steps.check_changed_files.outputs.any_changed == 'true' - uses: golangci/golangci-lint-action@v3 - with: - version: v1.50.0 - args: ${{ env.LINT_ARGS }} - skip-pkg-cache: true - skip-build-cache: true - working-directory: ${{ matrix.component }} - - licenses_check: - name: 3rd Party Licenses Check - if: ${{ github.event.head_commit.committer.name != 'github-actions[bot]' || ! 
contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Retrieve PR and branch info - run: | - PR_TITLE="chore: update 3rd-party licenses (#${{ github.event.number }})" - - PR_INFO=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - "https://api.github.com/repos/${{ github.repository }}/pulls?state=open" | \ - jq --arg TITLE "$PR_TITLE" '.[] | select(.title == $TITLE) | { number: .number, head: .head.ref }') - - echo "PR_INFO=$PR_INFO" - - PR_NUMBER=$(echo "$PR_INFO" | jq -r .number) - BRANCH_NAME=$(echo "$PR_INFO" | jq -r .head) - - echo "PR_TITLE=$PR_TITLE" >> $GITHUB_ENV - echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV - echo "BRANCH_NAME=${BRANCH_NAME:-update-third-party-licenses-${{ github.run_id }}}" >> $GITHUB_ENV - echo "PARENT_BRANCH=${{ github.head_ref }}" >> $GITHUB_ENV - - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - # if PR already exists - - name: Pull latest changes to existing branch - if: env.PR_NUMBER != '' - run: | - git fetch origin - git switch ${{ env.BRANCH_NAME }} - git pull origin ${{ env.PARENT_BRANCH }} --no-rebase - - - name: Set up Go - uses: actions/setup-go@v4 - with: - go-version: ${{ env.GO_VERSION }} - # Add all component folders for monorepos - cache-dependency-path: | - /go.sum - - - name: Install Go licenses - run: go install github.com/google/go-licenses@v1.4.0 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.9' - cache: 'pip' - cache-dependency-path: '.github/workflows/requirements.txt' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -r .github/workflows/requirements.txt - - - name: Run go mod tidy, go mod vendor & license check - # switch to each component folder first e.g. "./persistor" - run: | - cd - go mod tidy - go mod vendor - go-licenses report ./... 2>/dev/null | python .github/workflows/generate_and_check_licenses.py - cd ../ - - - name: Check and Commit changes - # add licenses for each component - run: | - git add ./persistor/licenses ./indexer-api/licenses ./resubmitter-api/licenses - - if ! git diff-index --quiet HEAD; then - git commit -m "chore: update third party licenses" - echo "changes_committed=true" >> $GITHUB_ENV - else - echo "changes_committed=false" >> $GITHUB_ENV - fi - - # This will fail if the incorrect go.mod or go.sum is committed - - name: Push changes - if: env.changes_committed == 'true' - run: | - git diff - - if [[ -z "$PR_NUMBER" ]]; then - git switch -c ${{ env.BRANCH_NAME }} - fi - git push origin HEAD - - - name: Create new PR - if: env.changes_committed == 'true' && env.PR_NUMBER == '' - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - echo "Creating a new PR..." - gh pr create --base "${{ env.PARENT_BRANCH }}" --head "update-third-party-licenses-${{ github.run_id }}" --title "${{ env.PR_TITLE }}" --body "This is an automated PR that updates the list of 3rd party licenses." - - # Runs unit tests for all components in this repo - test_job: - name: Test Job for all components - if: ${{ github.base_ref == 'main' && ! contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - - strategy: - matrix: - component: - - # Add file paths eg. 
'./persistor - - steps: - - name: Check out code into the Go module directory - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v3 - with: - go-version: ${{ env.GO_VERSION }} - # Add all component folders for monorepos - cache-dependency-path: | - /go.sum - - - name: Tidy Go mod for ${{ matrix.component }} - working-directory: ${{ matrix.component }} - run: go mod tidy - - - name: Run Go Test for ${{ matrix.component }} - working-directory: ${{ matrix.component }} - run: go test ${{env.TEST_ARGS}} ./... - - # Builds docker images for all components of the repo to test if they can successfully be built - Build_docker: - name: Test building docker images - if: ${{ ! contains(github.head_ref, 'release-please--branches--main') }} - runs-on: ubuntu-latest - - strategy: - matrix: - component: - - dockerfile-path: # Add dockerfile path eg. './persistor/docker/persistor/Dockerfile' - image-name: # Add image name eg. 'persistor-core' - - steps: - - name: Check out code - uses: actions/checkout@v4 - - - name: Set Tag - run: | - TAG="build-docker-test" - echo "TAG=$TAG" >> $GITHUB_ENV # Exporting the TAG variable to the environment - - - name: Build Docker image - run: | - docker build -t ${{ matrix.component.image-name }}:${{ env.TAG }} -f ${{ matrix.component.dockerfile-path }} . diff --git a/.github/workflows/products/push.yaml b/.github/workflows/products/push.yaml deleted file mode 100644 index b3e3e5f..0000000 --- a/.github/workflows/products/push.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: PUSH CI - -on: - push: - branches: [ develop, main ] - -env: - GO_VERSION: 1.21 - -jobs: - commitlint: - name: Commit Lint Job - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '22' - - - name: Install commitlint - run: | - npm install --save-dev @commitlint/{cli,config-conventional} - - - name: Validate current commit (last commit) with commitlint - run: npx commitlint --last --verbose - - license_headers: - name: Add License Headers - if: github.event.head_commit.committer.name != 'github-actions[bot]' - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Go environment - uses: actions/setup-go@v4 - with: - go-version: ${{ env.GO_VERSION }} - - - name: Install NWA tool - run: go install github.com/B1NARY-GR0UP/nwa@latest - - - name: Add missing license headers - run: nwa add -c "Syntio Ltd." # WRITE FOLDER PATHS FOR ALL FOLDERS THAT CONTAIN FILES THAT REQUIRE HEADERS eg. ./persistor - - - name: Check and Commit changes - id: check_commit - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add . - if ! git diff-index --quiet HEAD; then - git commit -m "style: add license headers" - echo "changes_committed=true" >> $GITHUB_ENV - else - echo "changes_committed=false" >> $GITHUB_ENV - echo "All necessary headers present." 
- fi - - - name: Create a new branch for the PR - if: env.changes_committed == 'true' - run: | - git checkout -b "add-license-headers-${{ github.run_id }}" - git push origin HEAD - - - name: Create Pull Request - if: env.changes_committed == 'true' - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh pr create --base ${{ github.ref_name }} --head "add-license-headers-${{ github.run_id }}" --title "style: add license headers" --body "This PR adds license headers to the affected files. Recommendation: Merge this PR using the rebase-merge method" - - Upload_docker: - name: Build, Push and Sign Docker Images - if: github.ref_name == 'main' - runs-on: ubuntu-latest - permissions: - id-token: write # required to generate JWT token - - strategy: - matrix: - component: - - dockerfile-path: # Add dockerfile path eg. './persistor/docker/persistor/Dockerfile' - image-name: # Add image name eg. 'persistor-core' - - steps: - - name: Check out code - uses: actions/checkout@v4 - - - name: Set Tag - run: | - TAG=$(cat version.txt) - echo "TAG=$TAG" >> $GITHUB_ENV - - - name: Docker Hub Login - run: | - echo "${{ secrets.DOCKERHUB_PASSWORD }}" | docker login --username "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin - - - name: Build Docker image - run: | - docker build -t ${{ matrix.component.image-name }}:${{ env.TAG }} -f ${{ matrix.component.dockerfile-path }} . - - - name: Check if Docker image tag exists - run: | - if docker manifest inspect syntioinc/dataphos-${{ matrix.component.image-name }}:${{ env.TAG }} > /dev/null 2>&1; then - echo "TAG_EXISTS=true" >> $GITHUB_ENV - else - echo "TAG_EXISTS=false" >> $GITHUB_ENV - fi - - - name: Tag and Push Docker image - if: ${{ env.TAG_EXISTS }} == 'false' - run: | - docker tag ${{ matrix.component.image-name }}:${{ env.TAG }} syntioinc/dataphos-${{ matrix.component.image-name }}:${{ env.TAG }} - docker push syntioinc/dataphos-${{ matrix.component.image-name }}:${{ env.TAG }} - - - name: Install cosign - if: ${{ env.TAG_EXISTS }} == 'false' - uses: sigstore/cosign-installer@v3.6.0 - - - name: Sign the Docker image - if: ${{ env.TAG_EXISTS }} == 'false' - run: | - digest=$(docker inspect --format='{{index .RepoDigests 0}}' syntioinc/dataphos-${{ matrix.component.image-name }}:${{ env.TAG }}) - cosign sign --yes "$digest" - - - name: Image already exists - if: ${{ env.TAG_EXISTS }} == 'true' - run: echo "Docker image syntioinc/dataphos-${{ matrix.component.image-name }}:${{ env.TAG }} already exists. Skipping push." 
-  release-please:
-    if: github.ref_name == 'main'
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      pull-requests: write
-    steps:
-      - uses: googleapis/release-please-action@v4
-        with:
-          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
-          release-type: simple
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
new file mode 100644
index 0000000..7353a8a
--- /dev/null
+++ b/.github/workflows/push.yaml
@@ -0,0 +1,69 @@
+name: PUSH CI
+on:
+  push:
+    branches: [ develop, main ]
+  workflow_dispatch:
+
+jobs:
+  commitlint:
+    name: Commit Lint Job
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '22'
+
+      - name: Install commitlint
+        run: |
+          npm install --save-dev @commitlint/{cli,config-conventional}
+      - name: Validate current commit (last commit) with commitlint
+        run: npx commitlint --last --verbose
+
+  deploy:
+    if: github.ref_name == 'main'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+      - name: Checkout Code
+        uses: 'actions/checkout@v3'
+        with:
+          submodules: 'true'
+
+      - name: Authenticate to Google Cloud
+        uses: 'google-github-actions/auth@v1'
+        with:
+          workload_identity_provider: "${{ secrets.WIF_PROVIDER }}"
+          service_account: "${{ secrets.WIF_ACCOUNT }}"
+
+      - id: 'deploy'
+        uses: 'google-github-actions/deploy-cloudrun@v1'
+        with:
+          service: 'dataphos-docs'
+          source: ./dataphos-docs
+          region: europe-west1
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
+          flags: '--port=1313 --max-instances=5 --memory=512Mi --timeout=300 --allow-unauthenticated'
+          env_vars: |
+            HUGO_ENV=production
+
+      - name: 'Use output'
+        run: 'curl "${{ steps.deploy.outputs.url }}"'
+
+  release-please:
+    if: github.ref_name == 'main'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - uses: googleapis/release-please-action@v4
+        with:
+          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
+          release-type: simple
diff --git a/.github/workflows/requirements.txt b/.github/workflows/requirements.txt
deleted file mode 100644
index ec752b4..0000000
--- a/.github/workflows/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-pandas
-tabulate
diff --git a/.gitignore b/.gitignore
index 3e44a89..b512c09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,24 +1 @@
-# Custom added files
-.idea
-.vscode
-.pre-commit-config.yaml
-git-conventional-commits.yaml
-
-# In case Act was used for local testing
-.secrets
-
-# Binaries for programs and plugins
-*.exe
-*.exe~
-*.dll
-*.so
-*.dylib
-
-# Test binary, built with `go test -c`
-*.test
-
-# Output of the go coverage tool, specifically when used with LiteIDE
-*.out
-
-# Dependency directories (remove the comment below to include it)
-# vendor/
+node_modules
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..0945a26
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "dataphos-docs/themes/hugo-geekdoc"]
+	path = dataphos-docs/themes/hugo-geekdoc
+	url = https://github.com/thegeeklab/hugo-geekdoc.git
diff --git a/.golangci.yaml b/.golangci.yaml
deleted file mode 100644
index 7e64123..0000000
--- a/.golangci.yaml
+++ /dev/null
@@ -1,1051 +0,0 @@
-# This file contains all available configuration options
-# with their default values.
-# options for analysis running
-run:
-  # The default concurrency value is the number of available CPU.
-  concurrency: 4
-  # Timeout for analysis, e.g. 30s, 5m.
- # Default: 1m - timeout: 5m - # Exit code when at least one issue was found. - # Default: 1 - issues-exit-code: 1 - # Include test files or not. - # Default: true - tests: true - # List of build tags, all linters use it. - # Default: []. - build-tags: - - mytag - # Which dirs to skip: issues from them won't be reported. - # Can use regexp here: `generated.*`, regexp is applied on full path. - # Default value is empty list, - # but default dirs are skipped independently of this option's value (see skip-dirs-use-default). - # "/" will be replaced by current OS file path separator to properly work on Windows. - skip-dirs: [ ] - # Enables skipping of directories: - # - vendor$, third_party$, testdata$, examples$, Godeps$, builtin$ - # Default: true - skip-dirs-use-default: true - # Which files to skip: they will be analyzed, but issues from them won't be reported. - # Default value is empty list, - # but there is no need to include all autogenerated files, - # we confidently recognize autogenerated files. - # If it's not please let us know. - # "/" will be replaced by current OS file path separator to properly work on Windows. - skip-files: [ ] - # If set we pass it to "go list -mod={option}". From "go help modules": - # If invoked with -mod=readonly, the go command is disallowed from the implicit - # automatic updating of go.mod described above. Instead, it fails when any changes - # to go.mod are needed. This setting is most useful to check that go.mod does - # not need updates, such as in a continuous integration and testing system. - # If invoked with -mod=vendor, the go command assumes that the vendor - # directory holds the correct copies of dependencies and ignores - # the dependency descriptions in go.mod. - # - # Allowed values: readonly|vendor|mod - # By default, it isn't set. - # modules-download-mode: - # Allow multiple parallel golangci-lint instances running. - # If false (default) - golangci-lint acquires file lock on start. - allow-parallel-runners: false - # Define the Go version limit. - # Mainly related to generics support since go1.18. - # Default: use Go version from the go.mod file, fallback on the env var `GOVERSION`, fallback on 1.18 - # go: 'GOVERSION' -# output configuration options - -output: - # colored-line-number|line-number|json|tab|checkstyle|code-climate, default is "colored-line-number" - format: colored-line-number - # print lines of code with issue, default is true - print-issued-lines: true - # print linter name in the end of issue text, default is true - print-linter-name: true - # make issues output unique by line, default is true - uniq-by-line: true - # add a prefix to the output file references; default is no prefix - path-prefix: "" - # Sort results by: filepath, line and column. 
- sort-results: false -# all available settings of specific linters - -linters: - disable-all: true - enable: - - asasalint - - asciicheck - - bidichk - - bodyclose - - containedctx - - contextcheck - # - cyclop - - decorder - - depguard - - dogsled - # - dupl - - durationcheck - - errchkjson - - errcheck - - errname - - errorlint - - execinquery - - exhaustive - # - exhaustruct - - exportloopref - - forcetypeassert - # - funlen - - gosimple - - govet - - gci - # - gochecknoglobals - - gochecknoinits - # - gocognit - - goconst - # - gocritic - # - gocyclo - - godot - - godox - - goerr113 - - gofumpt - - goheader - # - gomnd - - gomoddirectives - - gomodguard - - goprintffuncname - - gosec - - grouper - - ineffassign - - importas - # - ireturn - # - lll - # - maintidx - - makezero - - misspell - - nakedret - # - nestif - - nilerr - - nilnil - - nlreturn - - noctx - # - nolintlint - - nonamedreturns - - nosprintfhostport - # - paralleltest - - prealloc - - predeclared - - promlinter - # - revive - - rowserrcheck - - sqlclosecheck - - stylecheck - - staticcheck - - tagliatelle - - tenv - - testpackage - - thelper - - tparallel - - typecheck - - unconvert - - unparam - - unused - - varnamelen - - wastedassign - - whitespace - # - wrapcheck - - wsl - presets: - - bugs - - unused - # Run only fast linters from enabled linters set (first run won't be fast) - # Default: false - fast: false - -linters-settings: - asasalint: - # to specify a set of function names to exclude - # the values are merged with the builtin exclusions - # the builtin exclusions can be disabled by setting `use-builtin-exclusions` to `false` - # default: ["^(fmt|log|logger|t|)\.(Print|Fprint|Sprint|Fatal|Panic|Error|Warn|Warning|Info|Debug|Log)(|f|ln)$"] - exclude: - - Append - - \.Wrapf - # to enable/disable the asasalint builtin exclusions of function names - # see the default value of `exclude` to get the builtin exclusions, true by default - use-builtin-exclusions: true - # ignore *_test.go files, false by default - ignore-test: true - bidichk: - # the following configurations check for all mentioned invisible unicode runes. - # all runes are enabled by default. 
- left-to-right-embedding: true - right-to-left-embedding: true - pop-directional-formatting: true - left-to-right-override: true - right-to-left-override: true - left-to-right-isolate: true - right-to-left-isolate: true - first-strong-isolate: true - pop-directional-isolate: true - cyclop: - # the maximal code complexity to report, 10 by default - max-complexity: 10 - # the maximal average package complexity, 0.0 by default - # if it's higher than 0.0 (float) the check is enabled - package-average: 0.0 - # should ignore tests, false by default - skip-tests: true - decorder: - # required order of `type`, `const`, `var` and `func` declarations inside a file - # default: types before constants before variables before functions - dec-order: - - type - - const - - var - - func - # if true, order of declarations is not checked at all, true (disabled) by default - disable-dec-order-check: true - # if true, `init` func can be anywhere in file (does not have to be declared before all other functions), true (disabled) by default - disable-init-func-first-check: false - # if true, multiple global `type`, `const` and `var` declarations are allowed, true (disabled) by default - disable-dec-num-check: true - depguard: - # kind of list is passed in - # allowed values: allowlist|denylist, default: denylist - list-type: blacklist - # check the list against standard lib, false by default - include-go-root: false - # a list of packages for the list type specified - # can accept both string prefixes and string glob patterns - # default: [] - packages: [ ] - # a list of packages for the list type specifyed - # specify an error message to output when a denied package is used - # default: [] - packages-with-error-message: [ ] - # specify rules by which the linter ignores certain files for consideration - # can accept both string prefixes and string glob patterns - # the ! 
character in front of the rule is a special character - # which signals that the linter should negate the rule - # this allows for more precise control, but it is only available for glob patterns - # default: [] - ignore-file-rules: [ ] - # create additional guards that follow the same configuration pattern - # results from all guards are aggregated together - additional-guards: [ ] - # for example - # - list-type: denylist - # include-go-root: false - # packages: - # - github.com/stretchr/testify - # specify rules by which the linter ignores certain files for consideration - # ignore-file-rules: - # - "**/*_test.go" - # - "**/mock/**/*.go" - dogsled: - # checks assignments with too many blank identifiers; 2 by default - max-blank-identifiers: 2 - dupl: - # tokens count to trigger issue, 150 by default - threshold: 100 - errcheck: - # report about not checking of errors in type assertions: `a := b.(MyStruct)` - # false by default: such cases aren't reported by default - check-type-assertions: true - # report about assignment of errors to blank identifier: `num, _ := strconv.Atoi(numStr)` - # false by default: such cases aren't reported by default - check-blank: false - # path to a file containing a list of functions to exclude from checking - # see https://github.com/kisielk/errcheck#excluding-functions for details - disable-default-exclusions: true - # list of functions to exclude from checking, where each entry is a single function to exclude - # see https://github.com/kisielk/errcheck#excluding-functions for details - exclude-functions: [ ] - errchkjson: - # with check-error-free-encoding set to true, errchkjson does warn about errors - # from json encoding functions that are safe to be ignored, - # because they are not possible to happen - # - # if check-error-free-encoding is set to true and errcheck linter is enabled, - # it is recommended to add the following exceptions to prevent from false positives: - # - # linters-settings: - # errcheck: - # exclude-functions: - # - encoding/json.Marshal - # - encoding/json.MarshalIndent - # - # false by default - check-error-free-encoding: false - # issue on struct encoding that doesn't have exported fields, false by default - report-no-exported: false - errorlint: - # check whether fmt.Errorf uses the %w verb for formatting errors - # see the https://github.com/polyfloyd/go-errorlint for caveats, true by default - errorf: true - # check for plain type assertions and type switches, true by default - asserts: true - # check for plain error comparisons, true by default - comparison: true - exhaustive: - # check switch statements in generated files also, false by default - check-generated: false - # indicates that switch statements are to be considered exhaustive if a - # 'default' case is present, even if all enum members aren't listed in the - # switch - default-signifies-exhaustive: false - # enum members matching the supplied regex do not have to be listed in - # switch statements to satisfy exhaustiveness - # default: "" - ignore-enum-members: "" - # consider enums only in package scopes, not in inner scopes, false by default - package-scope-only: false - funlen: - # checks the number of lines in a function. - # if lower than 0, disable the check, 60 by default - lines: 100 - # checks the number of statements in a function. 
- # if lower than 0, disable the check, 40 by default - statements: 50 - gci: - # section configuration to compare against - # section names are case-insensitive and may contain parameters in () - # the default order of sections is `standard > default > custom > blank > dot`, - # if `custom-order` is `true`, it follows the order of `sections` option - # custom section: groups all imports with the specified Prefix - # blank section: contains all blank imports. This section is not present unless explicitly enabled - # dot section: contains all dot imports. This section is not present unless explicitly enabled - # default: ["standard", "default"] - sections: - - standard # Standard section: captures all standard packages - - default # Default section: contains all imports that could not be matched to another section type - - prefix(github.com/dataphos) - # skip generated files, true by default - skip-generated: true - # enable custom order of sections - # if `true`, make the section order the same as the order of `sections`, false by default - custom-order: false - gocognit: - # minimal code complexity to report, 30 by default (but we recommend 10-20) - min-complexity: 15 - goconst: - # minimal length of string constant, 3 by default - min-len: 3 - # minimal occurrences count to trigger, 3 by default - min-occurrences: 3 - # ignore test files, false by default - ignore-tests: true - # look for existing constants matching the values, true by default - match-constant: true - # search also for duplicated numbers, false by default - numbers: false - # minimum value, only works with goconst.numbers, 3 by default - min: 3 - # maximum value, only works with goconst.numbers, 3 by default - max: 3 - # ignore when constant is not used as function argument, true by default - ignore-calls: true - gocritic: - # which checks should be enabled; can't be combined with 'disabled-checks' - # see https://go-critic.github.io/overview#checks-overview - # to check which checks are enabled run `GL_DEBUG=gocritic golangci-lint run` - # by default, list of stable checks is used - enabled-checks: - - nestingReduce - - unnamedResult - - ruleguard - - truncateCmp - # which checks should be disabled; can't be combined with 'enabled-checks' - # default: [] - disabled-checks: [ ] - # enable multiple checks by tags, run `GL_DEBUG=gocritic golangci-lint run` to see all tags and checks - # see https://github.com/go-critic/go-critic#usage -> section "Tags" - # default: [] - enabled-tags: [ ] - disabled-tags: [ ] - # settings passed to gocritic. - # the settings key is the name of a supported gocritic checker. - # the list of supported checkers can be find in https://go-critic.github.io/overview. - settings: - # must be valid enabled check name. 
- nestingReduce: - # min number of statements inside a branch to trigger a warning, 5 by default - bodyWidth: 5 - # whether to check test functions, true by default - # skipTestFuncs: true - ruleguard: - # enable debug to identify which 'Where' condition was rejected - # the value of the parameter is the name of a function in a ruleguard file - # - # when a rule is evaluated: - # If: - # the Match() clause is accepted; and - # one of the conditions in the Where() clause is rejected, - # Then: - # ruleguard prints the specific Where() condition that was rejected - # - # The flag is passed to the ruleguard 'debug-group' argument - # Default: "" - debug: "" - # determines the behavior when an error occurs while parsing ruleguard files - # if flag is not set, log error and skip rule files that contain an error - # if flag is set, the value must be a comma-separated list of error conditions - # - 'all': fail on all errors - # - 'import': ruleguard rule imports a package that cannot be found - # - 'dsl': gorule file does not comply with the ruleguard DSL - # default: "" - failOn: "" - # comma-separated list of file paths containing ruleguard rules - # if a path is relative, it is relative to the directory where the golangci-lint command is executed - # the special '${configDir}' variable is substituted with the absolute directory containing the golangci config file - # glob patterns such as 'rules-*.go' may be specified - # default: "" - rules: "" - # comma-separated list of enabled groups or skip empty to enable everything - # tags can be defined with # character prefix - # default: "" - enable: "" - # comma-separated list of disabled groups or skip empty to enable everything - # tags can be defined with # character prefix - # default: "" - disable: "" - truncateCmp: - # whether to skip int/uint/uintptr types, true by deafult - skipArchDependent: true - unnamedResult: - # whether to check exported functions, false by default - checkExported: false - gocyclo: - # minimal code complexity to report, 30 by default (but we recommend 10-20) - min-complexity: 10 - godot: - # check all top-level comments, not only declarations - check-all: true - godox: - # report any comments starting with keywords, this is useful for TODO or FIXME comments that - # might be left in the code accidentally and should be resolved before merging - keywords: # default keywords are TODO, BUG, and FIXME, these can be overwritten by this setting - - NOTE - - OPTIMIZE # marks code that should be optimized before merging - - HACK # marks hack-arounds that should be removed before merging - gofmt: - # Simplify code: gofmt with `-s` option. - # Default: true - simplify: true - # Apply the rewrite rules to the source before reformatting. 
- # https://pkg.go.dev/cmd/gofmt - # Default: [] - rewrite-rules: [ ] - gofumpt: - # module path which contains the source code being formatted - # default: "" - module-path: "" - # choose whether to use the extra rules - # false by default - extra-rules: false - goheader: - # supports two types 'const` and `regexp` - # values can be used recursively - # default: {} - values: { } - # the template use for checking - # default: "" - template: "" - # ss alternative of directive 'template', you may put the path to file with the template source - # useful if you need to load the template from a specific file - # default: "" - template-path: "" - goimports: - # put imports beginning with prefix after 3rd-party packages - # it's a comma-separated list of prefixes - # default: "" - local-prefixes: "" - gomnd: - # list of enabled checks, see https://github.com/tommy-muehle/go-mnd/#checks for description - # default: ["argument", "case", "condition", "operation", "return", "assign"] - checks: [ argument,case,condition,operation,return,assign ] - # list of numbers to exclude from analysis - # the numbers should be written as string - # values always ignored: "1", "1.0", "0" and "0.0" - # default: [] - ignored-numbers: [ ] - # list of file patterns to exclude from analysis - # values always ignored: `.+_test.go` - # default: [] - ignored-files: [ ] - # list of function patterns to exclude from analysis - # values always ignored: `time.Date` - # default: [] - ignored-functions: [ ] - gomoddirectives: - # allow local `replace` directives. - # false by default - replace-local: false - # list of allowed `replace` directives. - # default: [] - replace-allow-list: [ ] - # allow to not explain why the version has been retracted in the `retract` directives. - # false by default - retract-allow-no-explanation: false - # forbid the use of the `exclude` directives. - # false by default - exclude-forbidden: false - gomodguard: - allowed: - # list of allowed modules - # default: [] - modules: [ ] - # list of allowed module domains - # default: [] - domains: [ ] - blocked: - # list of blocked modules - # default: [] - modules: - # blocked module - - github.com/uudashr/go-module: - # recommended modules that should be used instead. (Optional) - recommendations: - - golang.org/x/mod - # reason why the recommended module should be used. (Optional) - reason: "`mod` is the official go.mod parser library." - # list of blocked module version constraints - # default: [] - versions: [ ] - # set to true to raise lint issues for packages that are loaded from a local path via replace directive - # false by default - local_replace_directives: false - gosimple: - # https://staticcheck.io/docs/configuration/options/#checks - # default: ["*"] - checks: [ "*" ] - gosec: - # to select a subset of rules to run. - # available rules: https://github.com/securego/gosec#available-rules - # default: [] - means include all rules - includes: [ ] - # to specify a set of rules to explicitly exclude. - # available rules: https://github.com/securego/gosec#available-rules - # default: [] - excludes: [ ] - # exclude generated files - # false by default - exclude-generated: false - # filter out the issues with a lower severity than the given value - # valid options are: low, medium, high. - # low by default - severity: low - # filter out the issues with a lower confidence than the given value - # valid options are: low, medium, high. - # low by default - confidence: low - # concurrency value. 
- # default: the number of logical CPUs usable by the current process - concurrency: 12 - # to specify the configuration of rules - config: - # globals are applicable to all rules - global: - # if true, ignore #nosec in comments (and an alternative as well) - # false by default - nosec: false - # add an alternative comment prefix to #nosec (both will work at the same time) - # default: "" - "#nosec": "" - # define whether nosec issues are counted as finding or not - # default: false - show-ignored: false - # audit mode enables addition checks that for normal code analysis might be too nosy - # default: false - audit: false - G101: - # regexp pattern for variables and constants to find - # default: "(?i)passwd|pass|password|pwd|secret|token|pw|apiKey|bearer|cred" - pattern: "(?i)passwd|pass|password|pwd|secret|token|pw|apiKey|bearer|cred" - # if true, complain about all cases (even with low entropy) - # false by default - ignore_entropy: false - # maximum allowed entropy of the string - # "80.0" by default - entropy_threshold: "80.0" - # maximum allowed value of entropy/string length - # is taken into account if entropy >= entropy_threshold/2 - # "3.0" by default - per_char_threshold: "3.0" - # calculate entropy for first N chars of the string - # "16" by default - truncate: "16" - # additional functions to ignore while checking unhandled errors - # following functions always ignored: - # bytes.Buffer: - # - Write - # - WriteByte - # - WriteRune - # - WriteString - # fmt: - # - Print - # - Printf - # - Println - # - Fprint - # - Fprintf - # - Fprintln - # strings.Builder: - # - Write - # - WriteByte - # - WriteRune - # - WriteString - # io.PipeWriter: - # - CloseWithError - # hash.Hash: - # - Write - # os: - # - Unsetenv - # default: {} - G104: { } - G111: - # Regexp pattern to find potential directory traversal. - # Default: "http\\.Dir\\(\"\\/\"\\)|http\\.Dir\\('\\/'\\)" - pattern: "http\\.Dir\\(\"\\/\"\\)|http\\.Dir\\('\\/'\\)" - # maximum allowed permissions mode for os.Mkdir and os.MkdirAll - # "0750" by default - G301: "0750" - # maximum allowed permissions mode for os.OpenFile and os.Chmod - # "0600" by default - G302: "0600" - # maximum allowed permissions mode for os.WriteFile and ioutil.WriteFile - # "0600" by default - G306: "0600" - govet: - # report about shadowed variables. - # false by default - check-shadowing: false - # settings per analyzer. 
- # settings: - # analyzer name, run `go tool vet help` to see all analyzers - # printf: - # comma-separated list of print function names to check (in addition to default, see `go tool vet help printf`) - # default: [] - # funcs: [] - # shadow: - # Whether to be strict about shadowing; can be noisy - # false by default - # strict: false - # unusedresult: - # comma-separated list of functions whose results must be used - # (in addition to defaults context.WithCancel,context.WithDeadline,context.WithTimeout,context.WithValue, - # errors.New,fmt.Errorf,fmt.Sprint,fmt.Sprintf,sort.Reverse) - # default: [] - # funcs: [] - # comma-separated list of names of methods of type func() string whose results must be used - # (in addition to default Error,String) - # default: [] - # stringmethods: [] - # disable all analyzers - # false by default - disable-all: true - # enable analyzers by name (in addition to default) - # run `go tool vet help` to see all analyzers - # default: [] - enable: [ ] - # enable all analyzers - # false by default - enable-all: false - # disable analyzers by name - # run `go tool vet help` to see all analyzers - # default: [] - disable: [ ] - importas: - # so not allow unaliased imports of aliased packages, false by default - no-unaliased: false - # so not allow non-required aliases, false by default - no-extra-aliases: false - # list of aliases, default: [] - alias: [ ] - interfacebloat: - # the maximum number of methods allowed for an interface - # 10 by default - max: 10 - lll: - # max line length, lines longer will be reported, 120 by default - # '\t' is counted as 1 character by default, and can be changed with the tab-width option - line-length: 130 - # tab width in spaces, 1 by default - tab-width: 2 - maintidx: - # show functions with maintainability index lower than N - # a high index indicates better maintainability (it's kind of the opposite of complexity) - # 20 by default - under: 20 - makezero: - # allow only slices initialized with a length of zero - # false by default - always: false - misspell: - # correct spellings using locale preferences for US or UK - # setting locale to US will correct the British spelling of 'colour' to 'color' - # default is to use a neutral variety of English. - locale: US - # default: [] - ignore-words: [ ] - nakedret: - # make an issue if func has more lines of code than this setting, and it has naked returns - # 30 by default - max-func-lines: 30 - nestif: - # minimal complexity of if statements to report, 5 by default - min-complexity: 4 - nilnil: - # checks that there is no simultaneous return of `nil` error and an invalid value - # default: ["ptr", "func", "iface", "map", "chan"] - checked-types: - - ptr - - func - - iface - - map - - chan - nolintlint: - # enable to ensure that nolint directives are all used. Default is true. - allow-unused: false - # disable to ensure that nolint directives don't have a leading space, true by default - allow-leading-space: true - # exclude following linters from requiring an explanation. Default is []. 
- allow-no-explanation: [ ] - # enable to require an explanation of nonzero length after each nolint directive, false by default - require-explanation: true - # enable to require nolint directives to mention the specific linter being suppressed, false by default - require-specific: true - nonamedreturns: - # report named error if it is assigned inside defer - # false by default - report-error-in-defer: false - prealloc: - # IMPORTANT: we don't recommend using this linter before doing performance profiling - # for most programs usage of prealloc will be a premature optimization - # report pre-allocation suggestions only on simple loops that have no returns/breaks/continues/gotos in them, true by default - simple: true - # report pre-allocation suggestions on range loops, true by default - range-loops: true - # Report pre-allocation suggestions on for loops, false by default - for-loops: false - promlinter: - # promlinter cannot infer all metrics name in static analysis - # enable strict mode will also include the errors caused by failing to parse the args - # false by default - strict: false - # please refer to https://github.com/yeya24/promlinter#usage for detailed usage - # default: [] - disabled-linters: [ ] - reassign: - # patterns for global variable names that are checked for reassignment - # see https://github.com/curioswitch/go-reassign#usage - # default: ["EOF", "Err.*"] - patterns: [ "EOF", "Err.*" ] - revive: - # maximum number of open files at the same time - # see https://github.com/mgechev/revive#command-line-flags - # defaults to unlimited. - max-open-files: 2048 - # when set to false, ignores files with "GENERATED" header, similar to golint - # see https://github.com/mgechev/revive#available-rules for details - # false by default - ignore-generated-header: false - # sets the default severity. 
- # see https://github.com/mgechev/revive#configuration - # warning by default - severity: warning - # enable all available rules - # default: false - enable-all-rules: false - # sets the default failure confidence - # this means that linting errors with less than 0.8 confidence will be ignored - # 0.8 by default - confidence: 0.8 - rules: - - name: atomic - - name: blank-imports - - name: bool-literal-in-expr - - name: call-to-gc - - name: constant-logical-expr - - name: context-as-argument - - name: context-keys-type - - name: defer - - name: dot-imports - - name: duplicated-imports - - name: early-return - - name: empty-block - - name: empty-lines - - name: error-naming - - name: error-return - - name: error-strings - - name: errorf - - name: get-return - - name: identical-branches - - name: if-return - - name: increment-decrement - - name: indent-error-flow - - name: optimize-operands-order - - name: package-comments - - name: range - - name: range-val-in-closure - - name: receiver-naming - - name: string-of-int - - name: struct-tag - - name: superfluous-else - - name: time-equal - - name: time-naming - - name: var-declaration - - name: unconditional-recursion - - name: unexported-naming - - name: unexported-return - - name: unnecessary-stmt - - name: unreachable-code - - name: unused-parameter - - name: unused-receiver - - name: useless-break - - name: waitgroup-by-value - rowserrcheck: - # database/sql is always checked - # default: [] - packages: [ ] - staticcheck: - # https://staticcheck.io/docs/configuration/options/#checks - # default: ["*"] - checks: [ "*" ] - stylecheck: - # https://staticcheck.io/docs/configuration/options/#checks - # default: ["*"] - checks: [ "*" ] - # https://staticcheck.io/docs/configuration/options/#dot_import_whitelist - # default: ["github.com/mmcloughlin/avo/build", "github.com/mmcloughlin/avo/operand", "github.com/mmcloughlin/avo/reg"] - dot-import-whitelist: [ ] - # https://staticcheck.io/docs/configuration/options/#initialisms - # default: ["ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "ID", "IP", "JSON", "QPS", "RAM", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "GID", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS", "SIP", "RTP", "AMQP", "DB", "TS"] - initialisms: [ "ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "ID", "IP", "JSON", "QPS", "RAM", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "GID", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS", "SIP", "RTP", "AMQP", "DB", "TS" ] - # https://staticcheck.io/docs/configuration/options/#http_status_code_whitelist - # default: ["200", "400", "404", "500"] - http-status-code-whitelist: [ "200", "400", "404", "500" ] - tagliatelle: - # check the struck tag name case. 
- case: - # use the struct field name to check the name of the struct tag - # false by default - use-field-name: false - # `camel` is used for `json` and `yaml` (can be overridden) - # default: {} - rules: { } - tenv: - # the option `all` will run against whole test files (`_test.go`) regardless of method/function signatures - # otherwise, only methods that take `*testing.T`, `*testing.B`, and `testing.TB` as arguments are checked - # false by default - all: false - thelper: - test: - # check *testing.T is first param (or after context.Context) of helper function - # true by default - first: true - # check *testing.T param has name t - # true by default - name: true - # check t.Helper() begins helper function - # true by default - begin: true - benchmark: - # check *testing.B is first param (or after context.Context) of helper function - # true by default - first: true - # check *testing.B param has name b - # true by default - name: true - # check b.Helper() begins helper function - # true by default - begin: true - tb: - # check *testing.TB is first param (or after context.Context) of helper function - # true by default - first: true - # check *testing.TB param has name tb - # true by default - name: true - # check tb.Helper() begins helper function - # true by default - begin: true - fuzz: - # check *testing.F is first param (or after context.Context) of helper function - # true by default - first: true - # check *testing.F param has name f - # true by default - name: true - # check f.Helper() begins helper function - # true by default - begin: true - usestdlibvars: - # suggest the use of http.MethodXX - # true by default - http-method: true - # suggest the use of http.StatusXX - # true by default - http-status-code: true - # suggest the use of time.Weekday - # true by default - time-weekday: true - # suggest the use of time.Month - # false by default - time-month: false - # suggest the use of time.Layout - # false by default - time-layout: false - # suggest the use of crypto.Hash - # false by default - crypto-hash: false - # suggest the use of rpc.DefaultXXPath - # false by default - default-rpc-path: false - unparam: - # inspect exported functions. - # - # set to true if no external program/library imports your code. - # XXX: if you enable this setting, unparam will report a lot of false-positives in text editors: - # if it's called for subdir of a project it can't find external interfaces. All text editor integrations - # with golangci-lint call it on a directory with the changed file - # - # false by default - check-exported: false - varnamelen: - # the longest distance, in source lines, that is being considered a "small scope". 
- # variables used in at most this many lines will be ignored - # 5 by default - max-distance: 5 - # the minimum length of a variable's name that is considered "long" - # variable names that are at least this long will be ignored - # 3 by default - min-name-length: 3 - # check method receivers - # false by default - check-receiver: false - # check named return values - # false by default - check-return: false - # check type parameters - # false by default - check-type-param: false - # ignore "ok" variables that hold the bool return value of a type assertion - # false by default - ignore-type-assert-ok: false - # Ignore "ok" variables that hold the bool return value of a map index - # false by default - ignore-map-index-ok: false - # ignore "ok" variables that hold the bool return value of a channel receive - # false by default - ignore-chan-recv-ok: false - # optional list of variable names that should be ignored completely - # default: [] - ignore-names: [ ] - # optional list of variable declarations that should be ignored completely - # entries must be in one of the following forms (see below for examples): - # - for variables, parameters, named return values, method receivers, or type parameters: - # ( can also be a pointer/slice/map/chan/...) - # - for constants: const - # - # default: [] - ignore-decls: [ ] - wsl: - # if true append is only allowed to be cuddled if appending value is - # matching variables, fields or types online above, true by default - strict-append: true - # allow calls and assignments to be cuddled as long as the lines have any - # matching variables, fields or types, true by default - allow-assign-and-call: true - # allow multiline assignments to be cuddled, true by default - allow-multiline-assign: true - # allow declarations (var) to be cuddled - allow-cuddle-declarations: false - # allow trailing comments in ending of blocks - allow-trailing-comment: false - # force newlines in end of case at this limit (0 = never) - force-case-trailing-whitespace: 0 - # force cuddling of err checks with err var assignment - force-err-cuddling: false - # allow leading comments to be separated with empty liens - allow-separated-leading-comment: false - wrapcheck: - # an array of strings that specify substrings of signatures to ignore - # if this set, it will override the default set of ignored signatures - # see https://github.com/tomarrell/wrapcheck#configuration for more information - # default: [".Errorf(", "errors.New(", "errors.Unwrap(", ".Wrap(", ".Wrapf(", ".WithMessage(", ".WithMessagef(", ".WithStack("] - ignoreSigs: - - .Errorf( - - errors.New( - - errors.Unwrap( - - .Wrap( - - .Wrapf( - - .WithMessage( - - .WithMessagef( - - .WithStack( - # an array of strings that specify regular expressions of signatures to ignore - # default: [] - ignoreSigRegexps: [ ] - # an array of strings that specify globs of packages to ignore - # default: [] - ignorePackageGlobs: [ ] - # an array of strings that specify regular expressions of interfaces to ignore - # default: [] - ignoreInterfaceRegexps: [ ] - - # the custom section can be used to define linter plugins to be loaded at runtime - # see README documentation for more info -issues: - # Excluding configuration per-path, per-linter, per-text and per-source - exclude-rules: - - path: _test\.go - linters: - # - gocyclo - - errcheck - - gosec - - dupl - # - gocognit - - funlen - # independently of option `exclude` we use default exclude patterns, - # it can be disabled by this option. 
To list all - # excluded by default patterns execute `golangci-lint run --help` - # default value for this option is true - exclude-use-default: false - # the default value is false. If set to true exclude and exclude-rules - # regular expressions become case sensitive - exclude-case-sensitive: false - # the list of ids of default excludes to include or disable. Default is empty. - include: - - EXC0002 # disable excluding of issues about comments from golint - # maximum issues count per one linter. Set to 0 to disable. Default is 50 - max-issues-per-linter: 0 - # maximum count of issues with the same text. Set to 0 to disable. Default is 3 - max-same-issues: 0 - # Show only new issues: if there are unstaged changes or untracked files, - # only those changes are analyzed, else only changes in HEAD~ are analyzed - # It's a super-useful option for integration of golangci-lint into existing - # large codebase. It's not practical to fix all existing issues at the moment - # of integration: much better don't allow issues in new code. - # Default is false. - new: true -severity: - # Default value is empty string. - # Set the default severity for issues. If severity rules are defined and the issues - # do not match or no severity is provided to the rule this will be the default - # severity applied. Severities should match the supported severity names of the - # selected out format. - # - Code climate: https://docs.codeclimate.com/docs/issues#issue-severity - # - Checkstyle: https://checkstyle.sourceforge.io/property_types.html#severity - # - Github: https://help.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-error-message - default-severity: error - # The default value is false. - # If set to true severity-rules regular expressions become case-sensitive. - case-sensitive: false - # Default value is empty list. - # When a list of severity rules are provided, severity information will be added to lint - # issues. Severity rules have the same filtering capability as exclude rules except you - # are allowed to specify one matcher per severity rule. - # Only affects out formats that support setting severity information. - rules: - - linters: - - dupl - severity: info diff --git a/README.md b/README.md new file mode 100644 index 0000000..7a28ec2 --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# Dataphos Docs + +This repository contains the official documentation and examples of the Dataphos platform components. The following link will lead you to the official web page where the documentation is hosted: [docs.dataphos.com](https://docs.dataphos.com/). + +## Usage +### How to Run the Site Locally + +1. Install [Hugo](https://gohugo.io/). +2. In the `dataphos-docs/themes/hugo-geekdoc` folder, run `npm install` and `npm run build`. +3. Navigate to the `dataphos-docs` folder and run `hugo server -D`. This should result in the server running on `localhost:1313`. + +### How to Edit the Content + +The content is managed as a set of Markdown files in the `dataphos-docs/content` folder. Every markdown file is its own static page. See the `persistor/quickstart.md` as an example of how to utilize some basic editing. + +The order the pages will be displayed in the sidebar is determined by the `weight` parameter of the Markdown file header. + +### How to Edit the Look + +You can configure the look and feel of the site by editing the `static/custom.css` file. 
+ +## Examples + +Any examples or files used as part of the deployment instructions are made available in the `examples` folder of this repository. \ No newline at end of file diff --git a/STYLEGUIDE.md b/STYLEGUIDE.md deleted file mode 100644 index 9c98c53..0000000 --- a/STYLEGUIDE.md +++ /dev/null @@ -1,3554 +0,0 @@ - - -# Dataphos Go Style Guide - -## Table of Contents - -- [Introduction](#introduction) -- [Guidelines](#guidelines) - - [Pointers to Interfaces](#pointers-to-interfaces) - - [Verify Interface Compliance](#verify-interface-compliance) - - [Receivers and Interfaces](#receivers-and-interfaces) - - [Zero-value Mutexes are Valid](#zero-value-mutexes-are-valid) - - [Copy Slices and Maps at Boundaries](#copy-slices-and-maps-at-boundaries) - - [Defer to Clean Up](#defer-to-clean-up) - - [Channel Size is One or None](#channel-size-is-one-or-none) - - [Use Channels Carefully](#use-channels-carefully) - - [Start Enums at One](#start-enums-at-one) - - [Use `"time"` to handle time](#use-time-to-handle-time) - - [Errors](#errors) - - [Error Types](#error-types) - - [Error Wrapping](#error-wrapping) - - [Error Naming](#error-naming) - - [Alternative to fmt.Errorf](#alternative-to-fmterrorf) - - [Handle Type Assertion Failures](#handle-type-assertion-failures) - - [Don't Panic](#dont-panic) - - [Avoid Mutable Globals](#avoid-mutable-globals) - - [Avoid Embedding Types in Public Structs](#avoid-embedding-types-in-public-structs) - - [Avoid Using Built-In Names](#avoid-using-built-in-names) - - [Avoid `init()`](#avoid-init) - - [Exit in Main](#exit-in-main) - - [Exit Once](#exit-once) -- [Performance](#performance) - - [Prefer strconv over fmt](#prefer-strconv-over-fmt) - - [Avoid string-to-byte conversion](#avoid-string-to-byte-conversion) - - [Prefer Specifying Container Capacity](#prefer-specifying-container-capacity) - - [Specifying Map Capacity Hints](#specifying-map-capacity-hints) - - [Specifying Slice Capacity](#specifying-slice-capacity) -- [Style](#style) - - [Avoid overly long lines](#avoid-overly-long-lines) - - [Be Consistent](#be-consistent) - - [Avoid Side Effects](#avoid-side-effects) - - [Favor Pure Functions](#favour-pure-functions) - - [Don't Over Interface](#dont-over-interface) - - [Don't Under Package](#dont-under-package) - - [Keep The Happy Path Left](#keep-the-happy-path-left) - - [Group Similar Declarations](#group-similar-declarations) - - [Import Group Ordering](#import-group-ordering) - - [Package Names](#package-names) - - [Function Names](#function-names) - - [Import Aliasing](#import-aliasing) - - [Function Grouping and Ordering](#function-grouping-and-ordering) - - [Reduce Nesting](#reduce-nesting) - - [Unnecessary Else](#unnecessary-else) - - [Top-level Variable Declarations](#top-level-variable-declarations) - - [Embedding in Structs](#embedding-in-structs) - - [Local Variable Declarations](#local-variable-declarations) - - [nil is a valid slice](#nil-is-a-valid-slice) - - [Reduce Scope of Variables](#reduce-scope-of-variables) - - [Avoid Naked Parameters](#avoid-naked-parameters) - - [Use Raw String Literals to Avoid Escaping](#use-raw-string-literals-to-avoid-escaping) - - [Initializing Structs](#initializing-structs) - - [Use Field Names to Initialize Structs](#use-field-names-to-initialize-structs) - - [Omit Zero Value Fields in Structs](#omit-zero-value-fields-in-structs) - - [Use `var` for Zero Value Structs](#use-var-for-zero-value-structs) - - [Initializing Struct References](#initializing-struct-references) - - [Initializing 
Maps](#initializing-maps) -- [Patterns](#patterns) - - [Test Tables](#test-tables) - - [Functional Options](#functional-options) -- [Linting](#linting) - -## Introduction - -Styles are the conventions that govern our code. The term style is a bit of a -misnomer, since these conventions cover far more than just source file -formatting—gofmt handles that for us. - -The goal of this guide is to manage this complexity by describing in detail the -Dos and Don'ts of writing Go code. These rules exist to keep the code -base manageable while still allowing engineers to use Go language features -productively. - -This guide is a slight adaptation of Uber's style guide, which can be found here: - -A lot of these are general guidelines for Go, while others extend upon external -resources: - -1. [Effective Go](https://golang.org/doc/effective_go.html) -2. [Go Common Mistakes](https://github.com/golang/go/wiki/CommonMistakes) -3. [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments) - -All code should be error-free when run through `golint` and `go vet`. We -recommend setting up your editor to: - -- Run `goimports` on save -- Run `golint` and `go vet` to check for errors - -You can find information on editor support for Go tools here: - - -## Guidelines - -### Pointers to Interfaces - -You almost never need a pointer to an interface. You should be passing -interfaces as values—the underlying data can still be a pointer. - -An interface is two fields: - -1. A pointer to some type-specific information. You can think of this as "type." -2. Data pointer. If the data stored is a pointer, it’s stored directly. If the data stored is a value, then a pointer to the value is stored. - -If you want interface methods to modify the underlying data, you must use a -pointer. - -### Verify Interface Compliance - -Verify interface compliance at compile time where appropriate. This includes: - -- Exported types that are required to implement specific interfaces as part of - their API contract -- Exported or unexported types that are part of a collection of types - implementing the same interface -- Other cases where violating an interface would break users - - - - - -
BadGood
- -```go -type Handler struct { - // ... -} - - - -func (h *Handler) ServeHTTP( - w http.ResponseWriter, - r *http.Request, -) { - ... -} -``` - - - -```go -type Handler struct { - // ... -} - -var _ http.Handler = (*Handler)(nil) - -func (h *Handler) ServeHTTP( - w http.ResponseWriter, - r *http.Request, -) { - // ... -} -``` - -
- -The statement `var _ http.Handler = (*Handler)(nil)` will fail to compile if -`*Handler` ever stops matching the `http.Handler` interface. - -The right hand side of the assignment should be the zero value of the asserted -type. This is `nil` for pointer types (like `*Handler`), slices, and maps, and -an empty struct for struct types. - -```go -type LogHandler struct { - h http.Handler - log *zap.Logger -} - -var _ http.Handler = LogHandler{} - -func (h LogHandler) ServeHTTP( - w http.ResponseWriter, - r *http.Request, -) { - // ... -} -``` - -### Receivers and Interfaces - -Methods with value receivers can be called on pointers as well as values. -Methods with pointer receivers can only be called on pointers or [addressable values]. - -[addressable values]: https://golang.org/ref/spec#Method_values - -For example, - -```go -type S struct { - data string -} - -func (s S) Read() string { - return s.data -} - -func (s *S) Write(str string) { - s.data = str -} - -sVals := map[int]S{1: {"A"}} - -// You can only call Read using a value -sVals[1].Read() - -// This will not compile: -// sVals[1].Write("test") - -sPtrs := map[int]*S{1: {"A"}} - -// You can call both Read and Write using a pointer -sPtrs[1].Read() -sPtrs[1].Write("test") -``` - -Similarly, an interface can be satisfied by a pointer, even if the method has a -value receiver. - -```go -type F interface { - f() -} - -type S1 struct{} - -func (s S1) f() {} - -type S2 struct{} - -func (s *S2) f() {} - -s1Val := S1{} -s1Ptr := &S1{} -s2Val := S2{} -s2Ptr := &S2{} - -var i F -i = s1Val -i = s1Ptr -i = s2Ptr - -// The following doesn't compile, since s2Val is a value, and there is no value receiver for f. -// i = s2Val -``` - -Effective Go has a good write up on [Pointers vs. Values]. - -[Pointers vs. Values]: https://golang.org/doc/effective_go.html#pointers_vs_values - -### Zero-value Mutexes are Valid - -The zero-value of `sync.Mutex` and `sync.RWMutex` is valid, so you almost -never need a pointer to a mutex. - - - - - -
BadGood
- -```go -mu := new(sync.Mutex) -mu.Lock() -``` - - - -```go -var mu sync.Mutex -mu.Lock() -``` - -
- -If you use a struct by pointer, then the mutex should be a non-pointer field on -it. Do not embed the mutex on the struct, even if the struct is not exported. - - - - - - - -
BadGood
- -```go -type SMap struct { - sync.Mutex - - data map[string]string -} - -func NewSMap() *SMap { - return &SMap{ - data: make(map[string]string), - } -} - -func (m *SMap) Get(k string) string { - m.Lock() - defer m.Unlock() - - return m.data[k] -} -``` - - - -```go -type SMap struct { - mu sync.Mutex - - data map[string]string -} - -func NewSMap() *SMap { - return &SMap{ - data: make(map[string]string), - } -} - -func (m *SMap) Get(k string) string { - m.mu.Lock() - defer m.mu.Unlock() - - return m.data[k] -} -``` - -
- -The `Mutex` field, and the `Lock` and `Unlock` methods are unintentionally part -of the exported API of `SMap`. - - - -The mutex and its methods are implementation details of `SMap` hidden from its -callers. - -
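Since the zero value of `sync.RWMutex` is also valid, a read-mostly type can use it directly as an unexported field with no constructor. A minimal sketch (the `Counter` type is illustrative, not part of the guide):

```go
type Counter struct {
	mu sync.RWMutex // the zero value is ready to use; no constructor needed
	n  int
}

func (c *Counter) Inc() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.n++
}

func (c *Counter) Value() int {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.n
}
```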
- -### Copy Slices and Maps at Boundaries - -Slices and maps contain pointers to the underlying data so be wary of scenarios -when they need to be copied. - -#### Receiving Slices and Maps - -Keep in mind that users can modify a map or slice you received as an argument -if you store a reference to it. - - - - - - - - - - -
Bad Good
- -```go -func (d *Driver) SetTrips(trips []Trip) { - d.trips = trips -} - -trips := ... -d1.SetTrips(trips) - -// Did you mean to modify d1.trips? -trips[0] = ... -``` - - - -```go -func (d *Driver) SetTrips(trips []Trip) { - d.trips = make([]Trip, len(trips)) - copy(d.trips, trips) -} - -trips := ... -d1.SetTrips(trips) - -// We can now modify trips[0] without affecting d1.trips. -trips[0] = ... -``` - -
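The same defensive copy applies to maps received at a boundary. A minimal sketch, with an illustrative `Attributes` type:

```go
type Attributes map[string]string

type Driver struct {
	attrs Attributes
}

// SetAttributes copies the incoming map so that later mutations by the
// caller cannot silently change the Driver's internal state.
func (d *Driver) SetAttributes(attrs Attributes) {
	d.attrs = make(Attributes, len(attrs))
	for k, v := range attrs {
		d.attrs[k] = v
	}
}
```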
- -#### Returning Slices and Maps - -Similarly, be wary of user modifications to maps or slices exposing internal -state. - - - - - -
BadGood
- -```go -type Stats struct { - mu sync.Mutex - counters map[string]int -} - -// Snapshot returns the current stats. -func (s *Stats) Snapshot() map[string]int { - s.mu.Lock() - defer s.mu.Unlock() - - return s.counters -} - -// snapshot is no longer protected by the mutex, so any -// access to the snapshot is subject to data races. -snapshot := stats.Snapshot() -``` - - - -```go -type Stats struct { - mu sync.Mutex - counters map[string]int -} - -func (s *Stats) Snapshot() map[string]int { - s.mu.Lock() - defer s.mu.Unlock() - - result := make(map[string]int, len(s.counters)) - for k, v := range s.counters { - result[k] = v - } - return result -} - -// Snapshot is now a copy. -snapshot := stats.Snapshot() -``` - -
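The Good example above copies an internal map; internal slices deserve the same treatment when returned. A sketch, assuming an illustrative `Store` type:

```go
type Store struct {
	mu   sync.Mutex
	tags []string
}

// Tags returns a copy of the internal slice so callers cannot mutate
// s.tags through the returned value.
func (s *Store) Tags() []string {
	s.mu.Lock()
	defer s.mu.Unlock()

	tags := make([]string, len(s.tags))
	copy(tags, s.tags)
	return tags
}
```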
- -### Defer to Clean Up - -Use defer to clean up resources such as files and locks. - - - - - -
BadGood
- -```go -p.Lock() -if p.count < 10 { - p.Unlock() - return p.count -} - -p.count++ -newCount := p.count -p.Unlock() - -return newCount - -// easy to miss unlocks due to multiple returns -``` - - - -```go -p.Lock() -defer p.Unlock() - -if p.count < 10 { - return p.count -} - -p.count++ -return p.count - -// more readable -``` - -
- -Defer has an extremely small overhead and should be avoided only if you can -prove that your function execution time is on the order of nanoseconds. The -readability win of using defers is worth their minuscule cost. This -is especially true for larger methods that have more than simple memory -accesses, where the other computations are more significant than the `defer`. - -### Channel Size is One or None - -Channels should usually have a size of one or be unbuffered. By default, -channels are unbuffered and have a size of zero. Any other size -must be subject to a high level of scrutiny: there should be a measurable reason for choosing it. -Consider how the size is determined, what prevents the channel from filling up under load and blocking -writers, and what happens when this occurs. - - - - -
BadGood
- -```go -// Ought to be enough for anybody! -c := make(chan int, 64) -``` - - - -```go -// Size of one -c := make(chan int, 1) // or -// Unbuffered channel, size of zero -c := make(chan int) -``` - -
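One case where a buffer of one is measurably justified is a single producer that must be able to complete its only send even if the receiver has stopped waiting. A sketch (the names are illustrative, not an API from this guide):

```go
type result struct {
	val string
	err error
}

// fetchWithTimeout waits up to timeout for fn to finish. The channel is
// buffered with a size of one so the goroutine can always complete its
// single send and exit, even if the caller has already given up.
func fetchWithTimeout(timeout time.Duration, fn func() (string, error)) (string, error) {
	results := make(chan result, 1)

	go func() {
		val, err := fn()
		results <- result{val: val, err: err}
	}()

	select {
	case res := <-results:
		return res.val, res.err
	case <-time.After(timeout):
		return "", errors.New("timed out waiting for result")
	}
}
```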
- -### Use Channels Carefully - -Go's channels are immensely powerful, allowing you to write clean but complex concurrent code. However, they are not ordinary data structures and -shouldn't be used as such. The creation of a channel should generally be tightly coupled to the goroutine -that will be performing writes on it, so that we can reason about its behavior and performance more easily. -The goroutine that owns a channel should instantiate the channel, perform the writes (or pass ownership to another goroutine), -close the channel, and encapsulate the previous three things, exposing them via a reader channel. Not doing so has the potential to cause deadlocks and -other non-deterministic bugs which can't be caught through unit tests. -Basically, each channel comes with an implicit "contract", (unknowingly) created by the programmer, which dictates how the channel can be safely -used in that specific use case; therefore, if a single channel is used across a substantial chunk of code, every other programmer needs to be aware of that contract, -and failing to honor it will almost certainly lead to catastrophic bugs. - -A good, idiomatic way to use channels is to always encapsulate the channel-related logic behind straightforward, blocking methods or behind a "result" channel, -which significantly decreases the complexity of that implicit contract. The calling code should never be aware of channel-writing implementation details! -An example of this is the highly popular and highly encouraged "pipeline" pattern, which acts similarly to generators in Python. - -```go -func IntGenerator(done <-chan interface{}, integers ...int) <-chan int { - intStream := make(chan int) - - go func() { - defer close(intStream) - - for _, i := range integers { - select { - case <-done: - return - case intStream <- i: - } - } - }() - - return intStream -} -``` - - -### Start Enums at One - -The standard way of introducing enumerations in Go is to declare a custom type -and a `const` group with `iota`. Since variables have a 0 default value, you -should usually start your enums on a non-zero value. - - - - -
BadGood
- -```go -type Operation int - -const ( - Add Operation = iota - Subtract - Multiply -) - -// Add=0, Subtract=1, Multiply=2 -``` - - - -```go -type Operation int - -const ( - Add Operation = iota + 1 - Subtract - Multiply -) - -// Add=1, Subtract=2, Multiply=3 -``` - -
- -There are cases where using the zero value makes sense, for example when the -zero value case is the desirable default behavior. - -```go -type LogOutput int - -const ( - LogToStdout LogOutput = iota - LogToFile - LogToRemote -) - -// LogToStdout=0, LogToFile=1, LogToRemote=2 -``` - -### Use `"time"` to handle time - -Time is complicated. Incorrect assumptions often made about time include the -following. - -1. A day has 24 hours -2. An hour has 60 minutes -3. A week has 7 days -4. A year has 365 days -5. [And a lot more](https://infiniteundo.com/post/25326999628/falsehoods-programmers-believe-about-time) - -For example, *1* means that adding 24 hours to a time instant will not always -yield a new calendar day. - -Therefore, always use the [`"time"`] package when dealing with time because it -helps deal with these incorrect assumptions in a safer, more accurate manner. - -[`"time"`]: https://golang.org/pkg/time/ - -#### Use `time.Time` for instants of time - -Use [`time.Time`] when dealing with instants of time, and the methods on -`time.Time` when comparing, adding, or subtracting time. - -[`time.Time`]: https://golang.org/pkg/time/#Time - - - - - -
BadGood
- -```go -func isActive(now, start, stop int) bool { - return start <= now && now < stop -} -``` - - - -```go -func isActive(now, start, stop time.Time) bool { - return (start.Before(now) || start.Equal(now)) && now.Before(stop) -} -``` - -
- -#### Use `time.Duration` for periods of time - -Use [`time.Duration`] when dealing with periods of time and never rely on an "agreement" that some integer should be interpreted as seconds. -This is especially important because `time.Duration` is actually just a wrapper around `int64`, with nanoseconds as base/unit values. - -This means that if you see code similar to this, it is almost certainly not what the programmer meant, and this should never pass any PR: -```go -time.Sleep(5) -``` - -[`time.Duration`]: https://golang.org/pkg/time/#Duration - - - - - -
BadGood
- -```go -func poll(delay int) { - for { - // ... - time.Sleep(time.Duration(delay) * time.Millisecond) - } -} - -poll(10) // was it seconds or milliseconds? -``` - - - -```go -func poll(delay time.Duration) { - for { - // ... - time.Sleep(delay) - } -} - -poll(10*time.Second) -``` - -
- -Going back to the example of adding 24 hours to a time instant, the method we -use to add time depends on intent. If we want the same time of the day, but on -the next calendar day, we should use [`Time.AddDate`]. However, if we want an -instant of time guaranteed to be 24 hours after the previous time, we should -use [`Time.Add`]. - -[`Time.AddDate`]: https://golang.org/pkg/time/#Time.AddDate -[`Time.Add`]: https://golang.org/pkg/time/#Time.Add - -```go -newDay := t.AddDate(0 /* years */, 0 /* months */, 1 /* days */) -maybeNewDay := t.Add(24 * time.Hour) -``` - -#### Use `time.Time` and `time.Duration` with external systems - -Use `time.Duration` and `time.Time` in interactions with external systems when -possible. For example: - -- Command-line flags: [`flag`] supports `time.Duration` via - [`time.ParseDuration`] -- JSON: [`encoding/json`] supports encoding `time.Time` as an [RFC 3339] - string via its [`UnmarshalJSON` method] -- SQL: [`database/sql`] supports converting `DATETIME` or `TIMESTAMP` columns - into `time.Time` and back if the underlying driver supports it -- YAML: [`gopkg.in/yaml.v2`] supports `time.Time` as an [RFC 3339] string, and - `time.Duration` via [`time.ParseDuration`]. - - [`flag`]: https://golang.org/pkg/flag/ - [`time.ParseDuration`]: https://golang.org/pkg/time/#ParseDuration - [`encoding/json`]: https://golang.org/pkg/encoding/json/ - [RFC 3339]: https://tools.ietf.org/html/rfc3339 - [`UnmarshalJSON` method]: https://golang.org/pkg/time/#Time.UnmarshalJSON - [`database/sql`]: https://golang.org/pkg/database/sql/ - [`gopkg.in/yaml.v2`]: https://godoc.org/gopkg.in/yaml.v2 - -When it is not possible to use `time.Duration` in these interactions, use -`int` or `float64` and include the unit in the name of the field. - -For example, since `encoding/json` does not support `time.Duration`, the unit -is included in the name of the field. - - - - - -
BadGood
- -```go -// {"interval": 2} -type Config struct { - Interval int `json:"interval"` -} -``` - - - -```go -// {"intervalMillis": 2000} -type Config struct { - IntervalMillis int `json:"intervalMillis"` -} -``` - -
- -When it is not possible to use `time.Time` in these interactions, unless an -alternative is agreed upon, use `string` and format timestamps as defined in -[RFC 3339]. This format is used by default by [`Time.UnmarshalText`] and is -available for use in `Time.Format` and `time.Parse` via [`time.RFC3339`]. - -[`Time.UnmarshalText`]: https://golang.org/pkg/time/#Time.UnmarshalText -[`time.RFC3339`]: https://golang.org/pkg/time/#RFC3339 - -Although this tends to not be a problem in practice, keep in mind that the -`"time"` package does not support parsing timestamps with leap seconds -([8728]), nor does it account for leap seconds in calculations ([15190]). If -you compare two instants of time, the difference will not include the leap -seconds that may have occurred between those two instants. - -[8728]: https://github.com/golang/go/issues/8728 -[15190]: https://github.com/golang/go/issues/15190 - -### Errors - -#### Error Types - -There are few options for declaring errors. -Consider the following before picking the option best suited for your use case. - -- Does the caller need to match the error so that they can handle it? - If yes, we must support the [`errors.Is`] or [`errors.As`] functions - by declaring a top-level error variable or a custom type. -- Is the error message a static string, - or is it a dynamic string that requires contextual information? - For the former, we can use [`errors.New`], but for the latter we must - use [`fmt.Errorf`] or a custom error type. -- Are we propagating a new error returned by a downstream function? - If so, see the [section on error wrapping](#error-wrapping). - -[`errors.Is`]: https://golang.org/pkg/errors/#Is -[`errors.As`]: https://golang.org/pkg/errors/#As - -| Error matching? | Error Message | Guidance | -|-----------------|---------------|-------------------------------------| -| No | static | [`errors.New`] | -| No | dynamic | [`fmt.Errorf`] | -| Yes | static | top-level `var` with [`errors.New`] | -| Yes | dynamic | custom `error` type | - -[`errors.New`]: https://golang.org/pkg/errors/#New -[`fmt.Errorf`]: https://golang.org/pkg/fmt/#Errorf - -For example, -use [`errors.New`] for an error with a static string. -Export this error as a variable to support matching it with `errors.Is` -if the caller needs to match and handle this error. - - - - - -
No error matchingError matching
- -```go -// package foo - -func Open() error { - return errors.New("could not open") -} - -// package bar - -if err := foo.Open(); err != nil { - // Can't handle the error. - panic("unknown error") -} -``` - - - -```go -// package foo - -var ErrCouldNotOpen = errors.New("could not open") - -func Open() error { - return ErrCouldNotOpen -} - -// package bar - -if err := foo.Open(); err != nil { - if errors.Is(err, foo.ErrCouldNotOpen) { - // handle the error - } else { - panic("unknown error") - } -} -``` - -
- -For an error with a dynamic string, -use [`fmt.Errorf`] if the caller does not need to match it, -and a custom `error` if the caller does need to match it. - - - - - -
No error matchingError matching
- -```go -// package foo - -func Open(file string) error { - return fmt.Errorf("file %q not found", file) -} - -// package bar - -if err := foo.Open("testfile.txt"); err != nil { - // Can't handle the error. - panic("unknown error") -} -``` - - - -```go -// package foo - -type NotFoundError struct { - File string -} - -func (e *NotFoundError) Error() string { - return fmt.Sprintf("file %q not found", e.File) -} - -func Open(file string) error { - return &NotFoundError{File: file} -} - - -// package bar - -if err := foo.Open("testfile.txt"); err != nil { - var notFound *NotFoundError - if errors.As(err, ¬Found) { - // handle the error - } else { - panic("unknown error") - } -} -``` - -
- -Note that if you export error variables or types from a package, -they will become part of the public API of the package. - -#### Error Wrapping - -There are three main options for propagating errors if a call fails: - -- return the original error as-is -- add context with `fmt.Errorf` and the `%w` verb -- add context with `fmt.Errorf` and the `%v` verb - -Return the original error as-is if there is no additional context to add. -This maintains the original error type and message. -This is well suited for cases when the underlying error message -has sufficient information to track down where it came from. - -Otherwise, add context to the error message where possible -so that instead of a vague error such as "connection refused", -you get more useful errors such as "call service foo: connection refused". - -Use `fmt.Errorf` to add context to your errors, -picking between the `%w` or `%v` verbs -based on whether the caller should be able to -match and extract the underlying cause. - -- Use `%w` if the caller should have access to the underlying error. - This is a good default for most wrapped errors, - but be aware that callers may begin to rely on this behavior. - So for cases where the wrapped error is a known `var` or type, - document and test it as part of your function's contract. -- Use `%v` to obfuscate the underlying error. - Callers will be unable to match it, - but you can switch to `%w` in the future if needed. - -When adding context to returned errors, keep the context succinct by avoiding -phrases like "failed to", which state the obvious and pile up as the error -percolates up through the stack: - - - - - -
BadGood
- -```go -s, err := store.New() -if err != nil { - return fmt.Errorf( - "failed to create new store: %w", err) -} -``` - - - -```go -s, err := store.New() -if err != nil { - return fmt.Errorf( - "new store: %w", err) -} -``` - -
- -``` -failed to x: failed to y: failed to create new store: the error -``` - - - -``` -x: y: new store: the error -``` - -
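A minimal sketch of how the `%w` / `%v` choice described above plays out for callers (the package-level names here are illustrative):

```go
var ErrNotFound = errors.New("not found")

func findUser(id string) error {
	// ...
	return ErrNotFound
}

func loadProfile(id string) error {
	if err := findUser(id); err != nil {
		// %w keeps ErrNotFound reachable via errors.Is / errors.As.
		return fmt.Errorf("load profile %q: %w", id, err)
	}
	return nil
}

func loadAvatar(id string) error {
	if err := findUser(id); err != nil {
		// %v obfuscates the cause: errors.Is(err, ErrNotFound) reports false.
		return fmt.Errorf("load avatar %q: %v", id, err)
	}
	return nil
}
```

Callers of `loadProfile` can still branch on `errors.Is(err, ErrNotFound)`; callers of `loadAvatar` cannot, which is exactly the trade-off described above.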
- -However, once the error is sent to another system, it should be clear that the -message is an error (e.g. an `err` tag or "Failed" prefix in logs). - -See also [Don't just check errors, handle them gracefully]. - -[`"pkg/errors".Cause`]: https://godoc.org/github.com/pkg/errors#Cause -[Don't just check errors, handle them gracefully]: https://dave.cheney.net/2016/04/27/dont-just-check-errors-handle-them-gracefully - -#### Error Naming - -For error values stored as global variables, -use the prefix `Err` or `err` depending on whether they're exported. -This guidance supersedes the convention of prefixing unexported globals with `_`. - -```go -var ( - // The following two errors are exported - // so that users of this package can match them - // with errors.Is. - - ErrBrokenLink = errors.New("link is broken") - ErrCouldNotOpen = errors.New("could not open") - - // This error is not exported because - // we don't want to make it part of our public API. - // We may still use it inside the package - // with errors.Is. - - errNotFound = errors.New("not found") -) -``` - -For custom error types, use the suffix `Error` instead. - -```go -// Similarly, this error is exported -// so that users of this package can match it -// with errors.As. - -type NotFoundError struct { - File string -} - -func (e *NotFoundError) Error() string { - return fmt.Sprintf("file %q not found", e.File) -} - -// And this error is not exported because -// we don't want to make it part of the public API. -// We can still use it inside the package -// with errors.As. - -type resolveError struct { - Path string -} - -func (e *resolveError) Error() string { - return fmt.Sprintf("resolve %q", e.Path) -} -``` - -#### Alternative to fmt.Errorf - -A syntactically simpler, but otherwise equivalent, alternative to `fmt` is `github.com/pkg/errors`, which offers helper methods -so that you don't have to use formatted strings for error wrapping. This, however, is a matter of taste, and either one is fine, -but avoid mixing the two packages; if a codebase uses `fmt.Errorf("oops: %w", err)` then don't use `errors.Wrap(err, "oops")`, and vice versa. - -### Handle Type Assertion Failures - -The single return value form of a [type assertion] will panic on an incorrect -type. Therefore, always use the "comma ok" idiom. - -[type assertion]: https://golang.org/ref/spec#Type_assertions - - - - -
BadGood
- -```go -t := i.(string) -``` - - - -```go -t, ok := i.(string) -if !ok { - // handle the error gracefully -} -``` - -
- -### Don't Panic - -Code running in production must avoid panics. Panics are a major source of -[cascading failures]. If an error occurs, the function must return an error and -allow the caller to decide how to handle it. - -[cascading failures]: https://en.wikipedia.org/wiki/Cascading_failure - - - - - -
BadGood
- -```go -func run(args []string) { - if len(args) == 0 { - panic("an argument is required") - } - // ... -} - -func main() { - run(os.Args[1:]) -} -``` - - - -```go -func run(args []string) error { - if len(args) == 0 { - return errors.New("an argument is required") - } - // ... - return nil -} - -func main() { - if err := run(os.Args[1:]); err != nil { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } -} -``` - -
- -Panic/recover is not an error handling strategy. A program must panic only when -something irrecoverable happens such as a nil dereference. An exception to this is -program initialization: bad things at program startup that should abort the -program may cause panic. - -```go -var _statusTemplate = template.Must(template.New("name").Parse("_statusHTML")) -``` - -Even in tests, prefer `t.Fatal` or `t.FailNow` over panics to ensure that the -test is marked as failed. - - - - - -
BadGood
- -```go -// func TestFoo(t *testing.T) - -f, err := ioutil.TempFile("", "test") -if err != nil { - panic("failed to set up test") -} -``` - - - -```go -// func TestFoo(t *testing.T) - -f, err := ioutil.TempFile("", "test") -if err != nil { - t.Fatal("failed to set up test") -} -``` - -
- -### Avoid Mutable Globals - -Avoid mutating global variables, instead opting for dependency injection. -This applies to function pointers as well as other kinds of values. - - - - - - -
BadGood
- -```go -// sign.go - -var _timeNow = time.Now - -func sign(msg string) string { - now := _timeNow() - return signWithTime(msg, now) -} -``` - - - -```go -// sign.go - -type signer struct { - now func() time.Time -} - -func newSigner() *signer { - return &signer{ - now: time.Now, - } -} - -func (s *signer) Sign(msg string) string { - now := s.now() - return signWithTime(msg, now) -} -``` -
- -```go -// sign_test.go - -func TestSign(t *testing.T) { - oldTimeNow := _timeNow - _timeNow = func() time.Time { - return someFixedTime - } - defer func() { _timeNow = oldTimeNow }() - - assert.Equal(t, want, sign(give)) -} -``` - - - -```go -// sign_test.go - -func TestSigner(t *testing.T) { - s := newSigner() - s.now = func() time.Time { - return someFixedTime - } - - assert.Equal(t, want, s.Sign(give)) -} -``` - -
- -### Avoid Embedding Types in Public Structs - -These embedded types leak implementation details, inhibit type evolution, and -obscure documentation. - -Assuming you have implemented a variety of list types using a shared -`AbstractList`, avoid embedding the `AbstractList` in your concrete list -implementations. -Instead, hand-write only the methods to your concrete list that will delegate -to the abstract list. - -```go -type AbstractList struct {} - -// Add adds an entity to the list. -func (l *AbstractList) Add(e Entity) { - // ... -} - -// Remove removes an entity from the list. -func (l *AbstractList) Remove(e Entity) { - // ... -} -``` - - - - - -
BadGood
- -```go -// ConcreteList is a list of entities. -type ConcreteList struct { - *AbstractList -} -``` - - - -```go -// ConcreteList is a list of entities. -type ConcreteList struct { - list *AbstractList -} - -// Add adds an entity to the list. -func (l *ConcreteList) Add(e Entity) { - l.list.Add(e) -} - -// Remove removes an entity from the list. -func (l *ConcreteList) Remove(e Entity) { - l.list.Remove(e) -} -``` - -
- -Go allows [type embedding] as a compromise between inheritance and composition. -The outer type gets implicit copies of the embedded type's methods. -These methods, by default, delegate to the same method of the embedded -instance. - -[type embedding]: https://golang.org/doc/effective_go.html#embedding - -The struct also gains a field by the same name as the type. -So, if the embedded type is public, the field is public. -To maintain backward compatibility, every future version of the outer type must -keep the embedded type. - -An embedded type is rarely necessary. -It is a convenience that helps you avoid writing tedious delegate methods. - -Even embedding a compatible AbstractList *interface*, instead of the struct, -would offer the developer more flexibility to change in the future, but still -leak the detail that the concrete lists use an abstract implementation. - - - - - -
BadGood
- -```go -// AbstractList is a generalized implementation -// for various kinds of lists of entities. -type AbstractList interface { - Add(Entity) - Remove(Entity) -} - -// ConcreteList is a list of entities. -type ConcreteList struct { - AbstractList -} -``` - - - -```go -// ConcreteList is a list of entities. -type ConcreteList struct { - list AbstractList -} - -// Add adds an entity to the list. -func (l *ConcreteList) Add(e Entity) { - l.list.Add(e) -} - -// Remove removes an entity from the list. -func (l *ConcreteList) Remove(e Entity) { - l.list.Remove(e) -} -``` - -
- -Either with an embedded struct or an embedded interface, the embedded type -places limits on the evolution of the type. - -- Adding methods to an embedded interface is a breaking change. -- Removing methods from an embedded struct is a breaking change. -- Removing the embedded type is a breaking change. -- Replacing the embedded type, even with an alternative that satisfies the same - interface, is a breaking change. - -Although writing these delegate methods is tedious, the additional effort hides -an implementation detail, leaves more opportunities for change, and also -eliminates indirection for discovering the full List interface in -documentation. - -### Avoid Using Built-In Names - -The Go [language specification] outlines several built-in, -[predeclared identifiers] that should not be used as names within Go programs. - -Depending on context, reusing these identifiers as names will either shadow -the original within the current lexical scope (and any nested scopes) or make -affected code confusing. In the best case, the compiler will complain; in the -worst case, such code may introduce latent, hard-to-grep bugs. - -[language specification]: https://golang.org/ref/spec -[predeclared identifiers]: https://golang.org/ref/spec#Predeclared_identifiers - - - - - - -
BadGood
- -```go -var error string -// `error` shadows the builtin - -// or - -func handleErrorMessage(error string) { - // `error` shadows the builtin -} -``` - - - -```go -var errorMessage string -// `error` refers to the builtin - -// or - -func handleErrorMessage(msg string) { - // `error` refers to the builtin -} -``` - -
- -```go -type Foo struct { - // While these fields technically don't - // constitute shadowing, grepping for - // `error` or `string` strings is now - // ambiguous. - error error - string string -} - -func (f Foo) Error() error { - // `error` and `f.error` are - // visually similar - return f.error -} - -func (f Foo) String() string { - // `string` and `f.string` are - // visually similar - return f.string -} -``` - - - -```go -type Foo struct { - // `error` and `string` strings are - // now unambiguous. - err error - str string -} - -func (f Foo) Error() error { - return f.err -} - -func (f Foo) String() string { - return f.str -} -``` - -
- - -Note that the compiler will not generate errors when using predeclared -identifiers, but tools such as `go vet` should correctly point out these and -other cases of shadowing. - -### Avoid `init()` - -Avoid `init()` where possible. When `init()` is unavoidable or desirable, code -should attempt to: - -1. Be completely deterministic, regardless of program environment or invocation. -2. Avoid depending on the ordering or side-effects of other `init()` functions. While `init()` ordering is well-known, code can change, and thus relationships between `init()` functions can make code brittle and error-prone. -3. Avoid accessing or manipulating global or environment state, such as machine information, environment variables, working directory, program arguments/inputs, etc. -4. Avoid I/O, including both filesystem, network, and system calls. - -Code that cannot satisfy these requirements likely belongs as a helper to be -called as part of `main()` (or elsewhere in a program's lifecycle), or be -written as part of `main()` itself. In particular, libraries that are intended -to be used by other programs should take special care to be completely -deterministic and not perform "init magic". - - - - - - -
BadGood
- -```go -type Foo struct { - // ... -} - -var _defaultFoo Foo - -func init() { - _defaultFoo = Foo{ - // ... - } -} -``` - - - -```go -var _defaultFoo = Foo{ - // ... -} - -// or, better, for testability: - -var _defaultFoo = defaultFoo() - -func defaultFoo() Foo { - return Foo{ - // ... - } -} -``` - -
- -```go -type Config struct { - // ... -} - -var _config Config - -func init() { - // Bad: based on current directory - cwd, _ := os.Getwd() - - // Bad: I/O - raw, _ := ioutil.ReadFile( - path.Join(cwd, "config", "config.yaml"), - ) - - yaml.Unmarshal(raw, &_config) -} -``` - - - -```go -type Config struct { - // ... -} - -func loadConfig() Config { - cwd, err := os.Getwd() - // handle err - - raw, err := ioutil.ReadFile( - path.Join(cwd, "config", "config.yaml"), - ) - // handle err - - var config Config - yaml.Unmarshal(raw, &config) - - return config -} -``` - -
- -Considering the above, some situations in which `init()` may be preferable or -necessary might include: - -- Complex expressions that cannot be represented as single assignments. -- Pluggable hooks, such as `database/sql` dialects, encoding type registries, etc. -- Optimizations to [Google Cloud Functions] and other forms of deterministic - precomputation. - - [Google Cloud Functions]: https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations - -### Exit in Main - -Go programs use [`os.Exit`] or [`log.Fatal*`] to exit immediately. (Panicking -is not a good way to exit programs, please [don't panic](#dont-panic).) - -[`os.Exit`]: https://golang.org/pkg/os/#Exit -[`log.Fatal*`]: https://golang.org/pkg/log/#Fatal - -Call one of `os.Exit` or `log.Fatal*` **only in `main()`**. All other -functions should return errors to signal failure. -Never, ever use `os.Exit` or `log.Fatal*` as a way to handle errors. -This should never pass code review or be used as a "temporary" way to handle errors; errors are a natural occurrence, not bugs, and they -will always happen. Imagine what would happen, for example, if an HTTP request handling function handled errors through -`os.Exit` or `log.Fatal*`: you would effectively have enabled DoS attacks. - - - - -
BadGood
- -```go -func main() { - body := readFile(path) - fmt.Println(body) -} - -func readFile(path string) string { - f, err := os.Open(path) - if err != nil { - log.Fatal(err) - } - - b, err := ioutil.ReadAll(f) - if err != nil { - log.Fatal(err) - } - - return string(b) -} -``` - - - -```go -func main() { - body, err := readFile(path) - if err != nil { - log.Fatal(err) - } - fmt.Println(body) -} - -func readFile(path string) (string, error) { - f, err := os.Open(path) - if err != nil { - return "", err - } - - b, err := ioutil.ReadAll(f) - if err != nil { - return "", err - } - - return string(b), nil -} -``` - -
- -Rationale: Programs with multiple functions that exit present a few issues: - -- Non-obvious control flow: Any function can exit the program so it becomes - difficult to reason about the control flow. -- Difficult to test: A function that exits the program will also exit the test - calling it. This makes the function difficult to test and introduces risk of - skipping other tests that have not yet been run by `go test`. -- Skipped cleanup: When a function exits the program, it skips function calls - enqueued with `defer` statements. This adds risk of skipping important - cleanup tasks. - -#### Exit Once - -If possible, prefer to call `os.Exit` or `log.Fatal` **at most once** in your -`main()`. If there are multiple error scenarios that halt program execution, -put that logic under a separate function and return errors from it. - -This has the effect of shortening your `main()` function and putting all key -business logic into a separate, testable function. - - - - - -
BadGood
- -```go -package main - -func main() { - args := os.Args[1:] - if len(args) != 1 { - log.Fatal("missing file") - } - name := args[0] - - f, err := os.Open(name) - if err != nil { - log.Fatal(err) - } - defer f.Close() - - // If we call log.Fatal after this line, - // f.Close will not be called. - - b, err := ioutil.ReadAll(f) - if err != nil { - log.Fatal(err) - } - - // ... -} -``` - - - -```go -package main - -func main() { - if err := run(); err != nil { - log.Fatal(err) - } -} - -func run() error { - args := os.Args[1:] - if len(args) != 1 { - return errors.New("missing file") - } - name := args[0] - - f, err := os.Open(name) - if err != nil { - return err - } - defer f.Close() - - b, err := ioutil.ReadAll(f) - if err != nil { - return err - } - - // ... -} -``` - -
- -## Performance - -Performance-specific guidelines apply only to the hot path. - -### Prefer strconv over fmt - -When converting primitives to/from strings, `strconv` is faster than -`fmt`. - - - - - - -
BadGood
- -```go -for i := 0; i < b.N; i++ { - s := fmt.Sprint(rand.Int()) -} -``` - - - -```go -for i := 0; i < b.N; i++ { - s := strconv.Itoa(rand.Int()) -} -``` - -
- -``` -BenchmarkFmtSprint-4 143 ns/op 2 allocs/op -``` - - - -``` -BenchmarkStrconv-4 64.2 ns/op 1 allocs/op -``` - -
- -### Avoid string-to-byte conversion - -Do not create byte slices from a fixed string repeatedly. Instead, perform the -conversion once and capture the result. - - - - - - -
BadGood
- -```go -for i := 0; i < b.N; i++ { - w.Write([]byte("Hello world")) -} -``` - - - -```go -data := []byte("Hello world") -for i := 0; i < b.N; i++ { - w.Write(data) -} -``` - -
- -``` -BenchmarkBad-4 50000000 22.2 ns/op -``` - - - -``` -BenchmarkGood-4 500000000 3.25 ns/op -``` - -
- -### Prefer Specifying Container Capacity - -Specify container capacity where possible in order to allocate memory for the -container up front. This minimizes subsequent allocations (by copying and -resizing of the container) as elements are added. - -#### Specifying Map Capacity Hints - -Where possible, provide capacity hints when initializing -maps with `make()`. - -```go -make(map[T1]T2, hint) -``` - -Providing a capacity hint to `make()` tries to right-size the -map at initialization time, which reduces the need for growing -the map and allocations as elements are added to the map. - -Note that, unlike slices, map capacity hints do not guarantee complete, -preemptive allocation, but are used to approximate the number of hashmap buckets -required. Consequently, allocations may still occur when adding elements to the -map, even up to the specified capacity. - - - - - - -
BadGood
- -```go -m := make(map[string]os.FileInfo) - -files, _ := ioutil.ReadDir("./files") -for _, f := range files { - m[f.Name()] = f -} -``` - - - -```go - -files, _ := ioutil.ReadDir("./files") - -m := make(map[string]os.FileInfo, len(files)) -for _, f := range files { - m[f.Name()] = f -} -``` - -
- -`m` is created without a size hint; there may be more -allocations at assignment time. - - - -`m` is created with a size hint; there may be fewer -allocations at assignment time. - -
- -#### Specifying Slice Capacity - -Where possible, provide capacity hints when initializing slices with `make()`, -particularly when appending. - -```go -make([]T, length, capacity) -``` - -Unlike maps, slice capacity is not a hint: the compiler will allocate enough -memory for the capacity of the slice as provided to `make()`, which means that -subsequent `append()` operations will incur zero allocations (until the length -of the slice matches the capacity, after which any appends will require a resize -to hold additional elements). - - - - - - -
BadGood
- -```go -for n := 0; n < b.N; n++ { - data := make([]int, 0) - for k := 0; k < size; k++{ - data = append(data, k) - } -} -``` - - - -```go -for n := 0; n < b.N; n++ { - data := make([]int, 0, size) - for k := 0; k < size; k++{ - data = append(data, k) - } -} -``` - -
- -``` -BenchmarkBad-4 100000000 2.48s -``` - - - -``` -BenchmarkGood-4 100000000 0.21s -``` - -
- -## Style - -### Avoid overly long lines - -Avoid lines of code that require readers to scroll horizontally -or turn their heads too much. - -We recommend a soft line length limit of **99 characters**. -Authors should aim to wrap lines before hitting this limit, -but it is not a hard limit. -Code is allowed to exceed this limit. - -### Be Consistent - -Some of the guidelines outlined in this document can be evaluated objectively; -others are situational, contextual, or subjective. - -Above all else, **be consistent**. - -Consistent code is easier to maintain, is easier to rationalize, requires less -cognitive overhead, and is easier to migrate or update as new conventions emerge -or classes of bugs are fixed. - -Conversely, having multiple disparate or conflicting styles within a single -codebase causes maintenance overhead, uncertainty, and cognitive dissonance, -all of which can directly contribute to lower velocity, painful code reviews, -and bugs. - -When applying these guidelines to a codebase, it is recommended that changes -are made at a package (or larger) level: application at a sub-package level -violates the above concern by introducing multiple styles into the same code. - -### Avoid side-effects - -**Don't:** -```go -func init() { - someStruct.Load() -} -``` - -Side effects are only okay in special cases (e.g. parsing flags in a cmd). -If you find no other way, rethink and refactor. - -### Favour pure functions - -> In computer programming, a function may be considered a pure function if both of the following statements about the function hold: -> 1. The function always evaluates the same result value given the same argument value(s). The function result value cannot depend on any hidden information or state that may change while program execution proceeds or between different executions of the program, nor can it depend on any external input from I/O devices. -> 2. Evaluation of the result does not cause any semantically observable side effect or output, such as mutation of mutable objects or output to I/O devices. - -– [Wikipedia](https://en.wikipedia.org/wiki/Pure_function) - - - - - -
BadGood
- -```go -func MarshalAndWrite(some *Thing) error { - b, err := json.Marshal(some) - if err != nil { - return err - } - - return ioutil.WriteFile("some.thing", b, 0644) -} -``` - - - -```go -// Marshal is a pure func (even though useless) -func Marshal(some *Thing) ([]byte, error) { - return json.Marshal(some) -} - -// ... -``` - -
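One way to round out the Good example above is to keep the file I/O at the call site, composed from the pure `Marshal`. A sketch, with an illustrative `Thing` type and file path:

```go
type Thing struct {
	Name string `json:"name"`
}

// Marshal is the pure part: same input, same output, no I/O.
func Marshal(some *Thing) ([]byte, error) {
	return json.Marshal(some)
}

// WriteThing keeps the side effect (file I/O) at the edge, composed from
// the pure Marshal, which can be tested without touching the filesystem.
func WriteThing(some *Thing, path string) error {
	b, err := Marshal(some)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path, b, 0644)
}
```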
- -This is obviously not possible at all times, but trying to make every possible -func pure makes code more understandable and improves debugging. - -### Don't over-interface - - - - - -
BadGood
- -```go -type Server interface { - Serve() error - String() string - Some() int - Fields() float64 - That() string - Are([]byte) error - Not() []string - Necessary() error -} - -func debug(srv Server) { - fmt.Println(srv.String()) -} - -func run(srv Server) { - srv.Serve() -} -``` - - - -```go -type Server interface { - Serve() error -} - -func debug(v fmt.Stringer) { - fmt.Println(v.String()) -} - -func run(srv Server) { - srv.Serve() -} -``` - -
- -Favour small interfaces and only expect the interfaces you need in your functions. - -### Don't under-package - -Deleting or merging packages is far easier than splitting big ones up. -When unsure if a package can be split, do it. - -### Keep the happy path left - - - - - -
BadGood
- -```go -if item, ok := someMap[someKey]; ok { - return item -} -return ErrKeyNotFound -``` - - - -```go -item, ok := someMap[someKey] -if !ok { - return ErrKeyNotFound -} -return item -``` - -
- -This helps to keep your code clear and readable. -Skipping it lets conditions pile up in larger functions and buries the happy path under layers of nested if/for statements. - -### Group Similar Declarations - -Go supports grouping similar declarations. - - - -
BadGood
- -```go -import "a" -import "b" -``` - - - -```go -import ( - "a" - "b" -) -``` - -
- -This also applies to constants, variables, and type declarations. - - - - - -
BadGood
- -```go - -const a = 1 -const b = 2 - - - -var a = 1 -var b = 2 - - - -type Area float64 -type Volume float64 -``` - - - -```go -const ( - a = 1 - b = 2 -) - -var ( - a = 1 - b = 2 -) - -type ( - Area float64 - Volume float64 -) -``` - -
- -Only group related declarations. Do not group declarations that are unrelated. - - - - - -
BadGood
- -```go -type Operation int - -const ( - Add Operation = iota + 1 - Subtract - Multiply - EnvVar = "MY_ENV" -) -``` - - - -```go -type Operation int - -const ( - Add Operation = iota + 1 - Subtract - Multiply -) - -const EnvVar = "MY_ENV" -``` - -
- -Groups are not limited in where they can be used. For example, you can use them -inside of functions. - - - - - -
BadGood
- -```go -func f() string { - red := color.New(0xff0000) - green := color.New(0x00ff00) - blue := color.New(0x0000ff) - - // ... -} -``` - - - -```go -func f() string { - var ( - red = color.New(0xff0000) - green = color.New(0x00ff00) - blue = color.New(0x0000ff) - ) - - // ... -} -``` - -
- -Exception: Variable declarations, particularly inside functions, should be -grouped together if declared adjacent to other variables. Do this for variables -declared together even if they are unrelated. - - - - - -
BadGood
- -```go -func (c *client) request() { - caller := c.name - format := "json" - timeout := 5*time.Second - var err error - - // ... -} -``` - - - -```go -func (c *client) request() { - var ( - caller = c.name - format = "json" - timeout = 5*time.Second - err error - ) - - // ... -} -``` - -
- -### Import Group Ordering - -There should be two import groups: - -- Standard library -- Everything else - -This is the grouping applied by goimports by default. - - - - - -
BadGood
- -```go -import ( - "fmt" - "os" - "go.uber.org/atomic" - "golang.org/x/sync/errgroup" -) -``` - - - -```go -import ( - "fmt" - "os" - - "go.uber.org/atomic" - "golang.org/x/sync/errgroup" -) -``` - -
- -### Package Names - -When naming packages, choose a name that is: - -- All lower-case. No capitals or underscores. -- Does not need to be renamed using named imports at most call sites. -- Short and succinct. Remember that the name is identified in full at every call - site. -- Not plural. For example, `net/url`, not `net/urls`. -- Not "common", "util", "shared", or "lib". These are bad, uninformative names. - -See also [Package Names] and [Style guideline for Go packages]. - -[Package Names]: https://blog.golang.org/package-names -[Style guideline for Go packages]: https://rakyll.org/style-packages/ - -### Function Names - -We follow the Go community's convention of using [MixedCaps for function -names]. An exception is made for test functions, which may contain underscores -for the purpose of grouping related test cases, e.g., -`TestMyFunction_WhatIsBeingTested`. - -[MixedCaps for function names]: https://golang.org/doc/effective_go.html#mixed-caps - -### Import Aliasing - -Import aliasing must be used if the package name does not match the last -element of the import path. - -```go -import ( - "net/http" - - client "example.com/client-go" - trace "example.com/trace/v2" -) -``` - -In all other scenarios, import aliases should be avoided unless there is a -direct conflict between imports. - - - - - -
BadGood
- -```go -import ( - "fmt" - "os" - - - nettrace "golang.net/x/trace" -) -``` - - - -```go -import ( - "fmt" - "os" - "runtime/trace" - - nettrace "golang.net/x/trace" -) -``` - -
- -### Function Grouping and Ordering - -- Functions should be sorted in rough call order. -- Functions in a file should be grouped by receiver. - -Therefore, exported functions should appear first in a file, after -`struct`, `const`, `var` definitions. - -A `newXYZ()`/`NewXYZ()` may appear after the type is defined, but before the -rest of the methods on the receiver. - -Since functions are grouped by receiver, plain utility functions should appear -towards the end of the file. - - - - - -
BadGood
- -```go -func (s *something) Cost() { - return calcCost(s.weights) -} - -type something struct{ ... } - -func calcCost(n []int) int {...} - -func (s *something) Stop() {...} - -func newSomething() *something { - return &something{} -} -``` - - - -```go -type something struct{ ... } - -func newSomething() *something { - return &something{} -} - -func (s *something) Cost() { - return calcCost(s.weights) -} - -func (s *something) Stop() {...} - -func calcCost(n []int) int {...} -``` - -
- -### Reduce Nesting - -Code should reduce nesting where possible by handling error cases/special -conditions first and returning early or continuing the loop. Reduce the amount -of code that is nested multiple levels. - - - - - -
BadGood
- -```go -for _, v := range data { - if v.F1 == 1 { - v = process(v) - if err := v.Call(); err == nil { - v.Send() - } else { - return err - } - } else { - log.Printf("Invalid v: %v", v) - } -} -``` - - - -```go -for _, v := range data { - if v.F1 != 1 { - log.Printf("Invalid v: %v", v) - continue - } - - v = process(v) - if err := v.Call(); err != nil { - return err - } - v.Send() -} -``` - -
- -### Unnecessary Else - -If a variable is set in both branches of an if, it can be replaced with a -single if. - - - - - -
BadGood
- -```go -var a int -if b { - a = 100 -} else { - a = 10 -} -``` - - - -```go -a := 10 -if b { - a = 100 -} -``` - -
- -### Top-level Variable Declarations - -At the top level, use the standard `var` keyword. Do not specify the type, -unless it is not the same type as the expression. - - - - - -
BadGood
- -```go -var _s string = F() - -func F() string { return "A" } -``` - - - -```go -var _s = F() -// Since F already states that it returns a string, we don't need to specify -// the type again. - -func F() string { return "A" } -``` - -
- -Specify the type if the type of the expression does not match the desired type -exactly. - -```go -type myError struct{} - -func (myError) Error() string { return "error" } - -func F() myError { return myError{} } - -var _e error = F() -// F returns an object of type myError but we want error. -``` - -### Embedding in Structs - -Embedded types should be at the top of the field list of a -struct, and there must be an empty line separating embedded fields from regular -fields. - - - - - -
BadGood
- -```go -type Client struct { - version int - http.Client -} -``` - - - -```go -type Client struct { - http.Client - - version int -} -``` - -
- -Embedding should provide tangible benefit, like adding or augmenting -functionality in a semantically-appropriate way. It should do this with zero -adverse user-facing effects (see also: [Avoid Embedding Types in Public Structs]). - -Exception: Mutexes should not be embedded, even on unexported types. See also: [Zero-value Mutexes are Valid]. - -[Avoid Embedding Types in Public Structs]: #avoid-embedding-types-in-public-structs -[Zero-value Mutexes are Valid]: #zero-value-mutexes-are-valid - -Embedding **should not**: - -- Be purely cosmetic or convenience-oriented. -- Make outer types more difficult to construct or use. -- Affect outer types' zero values. If the outer type has a useful zero value, it - should still have a useful zero value after embedding the inner type. -- Expose unrelated functions or fields from the outer type as a side-effect of - embedding the inner type. -- Expose unexported types. -- Affect outer types' copy semantics. -- Change the outer type's API or type semantics. -- Embed a non-canonical form of the inner type. -- Expose implementation details of the outer type. -- Allow users to observe or control type internals. -- Change the general behavior of inner functions through wrapping in a way that - would reasonably surprise users. - -Simply put, embed consciously and intentionally. A good litmus test is, "would -all of these exported inner methods/fields be added directly to the outer type"; -if the answer is "some" or "no", don't embed the inner type - use a field -instead. - - - - - - - -
BadGood
- -```go -type A struct { - // Bad: A.Lock() and A.Unlock() are - // now available, provide no - // functional benefit, and allow - // users to control details about - // the internals of A. - sync.Mutex -} -``` - - - -```go -type countingWriteCloser struct { - // Good: Write() is provided at this - // outer layer for a specific - // purpose, and delegates work - // to the inner type's Write(). - io.WriteCloser - - count int -} - -func (w *countingWriteCloser) Write(bs []byte) (int, error) { - w.count += len(bs) - return w.WriteCloser.Write(bs) -} -``` - -
- -```go -type Book struct { - // Bad: pointer changes zero value usefulness - io.ReadWriter - - // other fields -} - -// later - -var b Book -b.Read(...) // panic: nil pointer -b.String() // panic: nil pointer -b.Write(...) // panic: nil pointer -``` - - - -```go -type Book struct { - // Good: has useful zero value - bytes.Buffer - - // other fields -} - -// later - -var b Book -b.Read(...) // ok -b.String() // ok -b.Write(...) // ok -``` - -
- -```go -type Client struct { - sync.Mutex - sync.WaitGroup - bytes.Buffer - url.URL -} -``` - - - -```go -type Client struct { - mtx sync.Mutex - wg sync.WaitGroup - buf bytes.Buffer - url url.URL -} -``` - -
- -### Local Variable Declarations - -Short variable declarations (`:=`) should be used if a variable is being set to -some value explicitly. - - - - - -
BadGood
- -```go -var s = "foo" -``` - - - -```go -s := "foo" -``` - -
- -However, there are cases where the default value is clearer when the `var` -keyword is used. [Declaring Empty Slices], for example. - -[Declaring Empty Slices]: https://github.com/golang/go/wiki/CodeReviewComments#declaring-empty-slices - - - - - -
BadGood
- -```go -func f(list []int) { - filtered := []int{} - for _, v := range list { - if v > 10 { - filtered = append(filtered, v) - } - } -} -``` - - - -```go -func f(list []int) { - var filtered []int - for _, v := range list { - if v > 10 { - filtered = append(filtered, v) - } - } -} -``` - -
- -### nil is a valid slice - -`nil` is a valid slice of length 0. This means that, - -- You should not return a slice of length zero explicitly. Return `nil` - instead. - - - - - -
BadGood
- - ```go - if x == "" { - return []int{} - } - ``` - - - - ```go - if x == "" { - return nil - } - ``` - -
- -- To check if a slice is empty, always use `len(s) == 0`. Do not check for - `nil`. - - - - - -
BadGood
- - ```go - func isEmpty(s []string) bool { - return s == nil - } - ``` - - - - ```go - func isEmpty(s []string) bool { - return len(s) == 0 - } - ``` - -
- - - The zero value (a slice declared with `var`) is usable immediately without - `make()`. - - - - - -
BadGood
- - ```go - nums := []int{} - // or, nums := make([]int) - - if add1 { - nums = append(nums, 1) - } - - if add2 { - nums = append(nums, 2) - } - ``` - - - - ```go - var nums []int - - if add1 { - nums = append(nums, 1) - } - - if add2 { - nums = append(nums, 2) - } - ``` - -
- -Remember that, while it is a valid slice, a nil slice is not equivalent to an -allocated slice of length 0 - one is nil and the other is not - and the two may -be treated differently in different situations (such as serialization). - -### Reduce Scope of Variables - -Where possible, reduce scope of variables. Do not reduce the scope if it -conflicts with [Reduce Nesting](#reduce-nesting). - - - - - -
BadGood
- -```go -err := ioutil.WriteFile(name, data, 0644) -if err != nil { - return err -} -``` - - - -```go -if err := ioutil.WriteFile(name, data, 0644); err != nil { - return err -} -``` - -
- -If you need a result of a function call outside of the if, then you should not -try to reduce the scope. - - - - - -
BadGood
- -```go -if data, err := ioutil.ReadFile(name); err == nil { - err = cfg.Decode(data) - if err != nil { - return err - } - - fmt.Println(cfg) - return nil -} else { - return err -} -``` - - - -```go -data, err := ioutil.ReadFile(name) -if err != nil { - return err -} - -if err := cfg.Decode(data); err != nil { - return err -} - -fmt.Println(cfg) -return nil -``` - -
- -### Avoid Naked Parameters - -Naked parameters in function calls can hurt readability. -A valid alternative to naked parameters is to replace naked `bool` types with custom types for more readable and -type-safe code. This allows more than just two states (true/false) for that -parameter in the future. - -```go -type Region int - -const ( - UnknownRegion Region = iota - Local -) - -type Status int - -const ( - StatusReady Status = iota + 1 - StatusDone - // Maybe we will have a StatusInProgress in the future. -) - -func printInfo(name string, region Region, status Status) -``` - -### Use Raw String Literals to Avoid Escaping - -Go supports [raw string literals](https://golang.org/ref/spec#raw_string_lit), -which can span multiple lines and include quotes. Use these to avoid -hand-escaped strings which are much harder to read. - - - - - -
BadGood
- -```go -wantError := "unknown name:\"test\"" -``` - - - -```go -wantError := `unknown name:"test"` -``` - -
- -### Initializing Structs - -#### Use Field Names to Initialize Structs - -You should almost always specify field names when initializing structs. This is -now enforced by [`go vet`]. - -[`go vet`]: https://golang.org/cmd/vet/ - - - - - -
BadGood
- -```go -k := User{"John", "Doe", true} -``` - - - -```go -k := User{ - FirstName: "John", - LastName: "Doe", - Admin: true, -} -``` - -
- -Exception: Field names *may* be omitted in test tables when there are 3 or -fewer fields. - -```go -tests := []struct{ - op Operation - want string -}{ - {Add, "add"}, - {Subtract, "subtract"}, -} -``` - -#### Omit Zero Value Fields in Structs - -When initializing structs with field names, omit fields that have zero values -unless they provide meaningful context. Otherwise, let Go set these to zero -values automatically. - - - - - -
BadGood
- -```go -user := User{ - FirstName: "John", - LastName: "Doe", - MiddleName: "", - Admin: false, -} -``` - - - -```go -user := User{ - FirstName: "John", - LastName: "Doe", -} -``` - -
- -This helps reduce noise for readers by omitting values that are default in -that context. Only meaningful values are specified. - -Include zero values where field names provide meaningful context. For example, -test cases in [Test Tables](#test-tables) can benefit from names of fields -even when they are zero-valued. - -```go -tests := []struct{ - give string - want int -}{ - {give: "0", want: 0}, - // ... -} -``` - -#### Use `var` for Zero Value Structs - -When all the fields of a struct are omitted in a declaration, use the `var` -form to declare the struct. - - - - - -
BadGood
- -```go -user := User{} -``` - - - -```go -var user User -``` - -
- -This differentiates zero valued structs from those with non-zero fields -similar to the distinction created for [map initialization], and matches how -we prefer to [declare empty slices][Declaring Empty Slices]. - -[map initialization]: #initializing-maps - -#### Initializing Struct References - -Use `&T{}` instead of `new(T)` when initializing struct references so that it -is consistent with the struct initialization. - - - - - -
BadGood
- -```go -sval := T{Name: "foo"} - -// inconsistent -sptr := new(T) -sptr.Name = "bar" -``` - - - -```go -sval := T{Name: "foo"} - -sptr := &T{Name: "bar"} -``` - -
- -### Initializing Maps - -Prefer `make(..)` for empty maps, and maps populated -programmatically. This makes map initialization visually -distinct from declaration, and it makes it easy to add size -hints later if available. - - - - - - -
BadGood
- -```go -var ( - // m1 is safe to read and write; - // m2 will panic on writes. - m1 = map[T1]T2{} - m2 map[T1]T2 -) -``` - - - -```go -var ( - // m1 is safe to read and write; - // m2 will panic on writes. - m1 = make(map[T1]T2) - m2 map[T1]T2 -) -``` - -
- -With a map literal (`m1 = map[T1]T2{}`), declaration and initialization look visually similar, whereas with `make` they are visually distinct. - -
- -Where possible, provide capacity hints when initializing -maps with `make()`. See -[Specifying Map Capacity Hints](#specifying-map-capacity-hints) -for more information. - -On the other hand, if the map holds a fixed list of elements, -use map literals to initialize the map. - - - - - -
BadGood
- -```go -m := make(map[T1]T2, 3) -m[k1] = v1 -m[k2] = v2 -m[k3] = v3 -``` - - - -```go -m := map[T1]T2{ - k1: v1, - k2: v2, - k3: v3, -} -``` - -
- - -The basic rule of thumb is to use map literals when adding a fixed set of -elements at initialization time, otherwise use `make` (and specify a size hint -if available). - -## Patterns - -### Test Tables - -Use table-driven tests with [subtests] to avoid duplicating code when the core -test logic is repetitive. - -[subtests]: https://blog.golang.org/subtests - - - - - -
BadGood
- -```go -// func TestSplitHostPort(t *testing.T) - -host, port, err := net.SplitHostPort("192.0.2.0:8000") -require.NoError(t, err) -assert.Equal(t, "192.0.2.0", host) -assert.Equal(t, "8000", port) - -host, port, err = net.SplitHostPort("192.0.2.0:http") -require.NoError(t, err) -assert.Equal(t, "192.0.2.0", host) -assert.Equal(t, "http", port) - -host, port, err = net.SplitHostPort(":8000") -require.NoError(t, err) -assert.Equal(t, "", host) -assert.Equal(t, "8000", port) - -host, port, err = net.SplitHostPort("1:8") -require.NoError(t, err) -assert.Equal(t, "1", host) -assert.Equal(t, "8", port) -``` - - - -```go -// func TestSplitHostPort(t *testing.T) - -tests := []struct{ - give string - wantHost string - wantPort string -}{ - { - give: "192.0.2.0:8000", - wantHost: "192.0.2.0", - wantPort: "8000", - }, - { - give: "192.0.2.0:http", - wantHost: "192.0.2.0", - wantPort: "http", - }, - { - give: ":8000", - wantHost: "", - wantPort: "8000", - }, - { - give: "1:8", - wantHost: "1", - wantPort: "8", - }, -} - -for _, tt := range tests { - t.Run(tt.give, func(t *testing.T) { - host, port, err := net.SplitHostPort(tt.give) - require.NoError(t, err) - assert.Equal(t, tt.wantHost, host) - assert.Equal(t, tt.wantPort, port) - }) -} -``` - -
- -Test tables make it easier to add context to error messages, reduce duplicate -logic, and add new test cases. - -We follow the convention that the slice of structs is referred to as `tests` -and each test case `tt`. Further, we encourage explicating the input and output -values for each test case with `give` and `want` prefixes. - -```go -tests := []struct{ - give string - wantHost string - wantPort string -}{ - // ... -} - -for _, tt := range tests { - // ... -} -``` - -### Functional Options - -Functional options is a pattern in which you declare an opaque `Option` type -that records information in some internal struct. You accept a variadic number -of these options and act upon the full information recorded by the options on -the internal struct. - -Use this pattern for optional arguments in constructors and other public APIs -that you foresee needing to expand, especially if you already have three or -more arguments on those functions. - - - - - - -
BadGood
- -```go -// package db - -func Open( - addr string, - cache bool, - logger *zap.Logger, -) (*Connection, error) { - // ... -} -``` - - - -```go -// package db - -type Option func(*Settings) - -func WithCache(c bool) Option { - // ... -} - -func WithLogger(log *zap.Logger) Option { - // ... -} - -// Open creates a connection. -func Open( - addr string, - opts ...Option, -) (*Connection, error) { - // ... -} -``` - -
- -Without functional options, the cache and logger parameters must always be provided, even if the user wants to use the default. - -```go -db.Open(addr, db.DefaultCache, zap.NewNop()) -db.Open(addr, db.DefaultCache, log) -db.Open(addr, false /* cache */, zap.NewNop()) -db.Open(addr, false /* cache */, log) -``` - - - -With functional options, options are provided only if needed. - -```go -db.Open(addr) -db.Open(addr, db.WithLogger(log)) -db.Open(addr, db.WithCache(false)) -db.Open( - addr, - db.WithCache(false), - db.WithLogger(log), -) -``` - -
- -See also: - -- [Self-referential functions and the design of options] -- [Functional options for friendly APIs] - - [Self-referential functions and the design of options]: https://commandcenter.blogspot.com/2014/01/self-referential-functions-and-design.html - [Functional options for friendly APIs]: https://dave.cheney.net/2014/10/17/functional-options-for-friendly-apis - -## Linting - -More importantly than any "blessed" set of linters, lint consistently across a -codebase. - -We recommend using the following linters at a minimum, because we feel that they -help to catch the most common issues and also establish a high bar for code -quality without being unnecessarily prescriptive: - -- [errcheck] to ensure that errors are handled -- [goimports] to format code and manage imports -- [golint] to point out common style mistakes -- [govet] to analyze code for common mistakes -- [staticcheck] to do various static analysis checks - - [errcheck]: https://github.com/kisielk/errcheck - [goimports]: https://godoc.org/golang.org/x/tools/cmd/goimports - [golint]: https://github.com/golang/lint - [govet]: https://golang.org/cmd/vet/ - [staticcheck]: https://staticcheck.io/ - - -### Lint Runners - -We recommend [golangci-lint] as the go-to lint runner for Go code, largely due -to its performance in larger codebases and ability to configure and use many -canonical linters at once. This repo has an example [.golangci.yaml](.golangci.yaml) config file -with recommended linters and settings. - -golangci-lint has [various linters] available for use. The above linters are -recommended as a base set, and we encourage teams to add any additional linters -that make sense for their projects. - -[golangci-lint]: https://github.com/golangci/golangci-lint -[various linters]: https://golangci-lint.run/usage/linters/ diff --git a/dataphos-docs/.hugo_build.lock b/dataphos-docs/.hugo_build.lock new file mode 100644 index 0000000..e69de29 diff --git a/dataphos-docs/Dockerfile b/dataphos-docs/Dockerfile new file mode 100644 index 0000000..11e48e3 --- /dev/null +++ b/dataphos-docs/Dockerfile @@ -0,0 +1,13 @@ +FROM klakegg/hugo:0.101.0-ext-alpine + +WORKDIR /app + +COPY . . 
+ +WORKDIR ./themes/hugo-geekdoc +RUN npm install +RUN npm run build + +WORKDIR /app + +CMD ["server"] \ No newline at end of file diff --git a/dataphos-docs/archetypes/default.md b/dataphos-docs/archetypes/default.md new file mode 100644 index 0000000..00e77bd --- /dev/null +++ b/dataphos-docs/archetypes/default.md @@ -0,0 +1,6 @@ +--- +title: "{{ replace .Name "-" " " | title }}" +date: {{ .Date }} +draft: true +--- + diff --git a/dataphos-docs/config.toml b/dataphos-docs/config.toml new file mode 100644 index 0000000..7b19d84 --- /dev/null +++ b/dataphos-docs/config.toml @@ -0,0 +1,31 @@ +baseURL = "/" +title = "DATAPHOS" +theme = "hugo-geekdoc" + +googleAnalytics = "G-050NNGSD05" +pluralizeListTitles = false + +# Geekdoc required configuration +pygmentsUseClasses = true +pygmentsCodeFences = true +disablePathToLower = true +enableGitInfo = false + +# Required if you want to render robots.txt template +enableRobotsTXT = true + +# Needed for mermaid shortcodes +[markup] + [markup.goldmark.renderer] + # Needed for mermaid shortcode + unsafe = true + [markup.tableOfContents] + startLevel = 1 + endLevel = 9 + +[taxonomies] + tag = "tags" + +[params] + geekdocLogo = "brand.png" + \ No newline at end of file diff --git a/dataphos-docs/content/Contact/_index.md b/dataphos-docs/content/Contact/_index.md new file mode 100644 index 0000000..0e23229 --- /dev/null +++ b/dataphos-docs/content/Contact/_index.md @@ -0,0 +1,18 @@ +--- +title: "Contact" +draft: false +weight: 6 +--- + +Welcome to the Dataphos Contact Form! We're delighted to hear from you and eager to assist with any questions, inquiries, or feedback you may have. Your thoughts are of the utmost importance, and we're here to ensure your experience is seamless and your concerns are addressed promptly. Please feel free to reach out through any of the following contact methods: + +- [Contact Form]({{< ref "contact" >}} "Contact Form") +- Email: **dataphos@syntio.net** +- Syntio website: [link](https://www.syntio.net?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs/) +- Syntio contact form for other business inqueries: [link](https://www.syntio.net/en/contact?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) +- Social Media: Connect with us on [LinkedIn](https://www.linkedin.com/company/syntio/) or [Instagram](https://www.instagram.com/syntio.data.engineering/?hl=en) + + +We look forward to connecting with you! + +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/Contact/form.md b/dataphos-docs/content/Contact/form.md new file mode 100644 index 0000000..4dfcbd3 --- /dev/null +++ b/dataphos-docs/content/Contact/form.md @@ -0,0 +1,8 @@ +--- +title: "Form" +draft: false +--- +{{< rawhtml >}} + +{{< /rawhtml >}} + diff --git a/dataphos-docs/content/_index.md b/dataphos-docs/content/_index.md new file mode 100644 index 0000000..121d840 --- /dev/null +++ b/dataphos-docs/content/_index.md @@ -0,0 +1,20 @@ +--- +title: "Dataphos" +draft: false +--- +![](/dataphos.png) + +Welcome to the official documentation of **Dataphos**. **Dataphos** is a collection of microservices designed to solve the common issues arising in the current world of Data Engineering. Although each component is designed and developed independently to support the specific needs of existing architectures, **Dataphos** is built to inevitably come together as a unified **ingestion platform**, guiding the journey of your data from on-premise systems into the cloud and beyond. 
+ +Started in 2020 by battle-hardened industry professionals, **Dataphos** was built as a solution to the common issues plaguing **Event-Driven Architectures**: + +* How do you transfer data from an on-premise database as a collection of **structured records** without relying on CDC solutions to merely replicate the database row-by-row? +* How do you ensure that data consumers don't break due to sudden changes in the underlying schema of the data? +* How do you ensure proper fail-safes exist while using streaming services? How do you replay messages? +* How can you quickly and efficiently build structured Data Lake architectures? + +At its core, **Dataphos** is an accelerator, enabling your engineers to focus on delivering business value, instead of re-building the same patterns and solutions other businesses spent months or years struggling with. + +This documentation will provide a general overview of the key Dataphos use-cases, allow you to quickly set up an environment to deploy individual Dataphos components in (or help you deploy them in already-existing ones with minimal hassle), and provide in-depth information on each individual component's configuration. + +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/persistor/_index.md b/dataphos-docs/content/persistor/_index.md new file mode 100644 index 0000000..31023a5 --- /dev/null +++ b/dataphos-docs/content/persistor/_index.md @@ -0,0 +1,9 @@ +--- +title: "Persistor" +draft: false +weight: 1 +--- + +The Persistor is a stateless component designed to store messages procured from a topic within a data pipeline, providing a seamless interface for their retrieval and potential resubmission to a topic. Acting as a failsafe, Persistor establishes a connection to a message broker through a subscription, methodically archiving messages either individually or in batches. + +{{< toc-tree >}} diff --git a/dataphos-docs/content/persistor/configuration/_index.md b/dataphos-docs/content/persistor/configuration/_index.md new file mode 100644 index 0000000..179bbbb --- /dev/null +++ b/dataphos-docs/content/persistor/configuration/_index.md @@ -0,0 +1,25 @@ +--- +title: "Deployment Customization" +draft: false +weight: 4 +geekdocCollapseSection: true +--- + +This page describes the Persistor architecture. Whereas the [Quickstart](/persistor/quickstart) will get you started fairly quickly, this page will explain more precisely the individual components being deployed, how they interact and how to configure them. The following pages go into further detail on how to customize your Kubernetes deployments: +{{< toc-tree >}} + + +# Persistor Architecture + +The following diagram gives an overview of the individual Persistor components and how they interact with your underlying Kubernetes environment: + +![Architecture](/persistor_arch.png) + +When deploying the Persistor, you deploy the following components: + +* The **Persistor Core** component, responsible for attaching itself to the streaming service and persisting the data to blob storage. +* The **Indexer Database** that will be used to track where messages are located across your Data Lake. +* The **Indexer** component, responsible for storing the metadata in the Indexer Database. +* The **Indexer API**, responsible for querying the **Indexer Database**. +* The **Resubmitter**, responsible for resubmitting the messages by querying the Indexer Database, retrieving the data based on the found locations and resubmitting it to the target broker.
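
To make the mapping between these components and the deployment configuration concrete, the following is a minimal sketch of how they surface as blocks in the chart's `values.yaml`. It is an illustration only: the broker, storage, indexer, topic and bucket identifiers are placeholders, and the following pages document the full set of options.

```yaml
# Illustrative outline only; all identifiers, topics and project names are placeholders.
brokers:
  gcp-broker:
    type: pubsub
    projectID: my-gcp-project

storage:
  gcp-storage:
    type: gcs
    projectID: my-gcp-project

persistor:
  history-persistor:
    broker: gcp-broker            # which broker to pull messages from
    topic: persistor-topic
    consumerID: persistor-sub
    storage: gcp-storage          # where to persist the messages
    storageTargetID: persistor-bucket
    indexer: history-indexer      # ties the Persistor to an Indexer instance

indexer:
  history-indexer:
    broker: gcp-broker
    topic: indexer-topic          # topic the indexer metadata is pulled from
    consumerID: indexer-sub

resubmitter:
  history-resubmitter:
    broker: gcp-broker            # broker resubmitted messages are sent to
    storage: gcp-storage
    indexer: history-indexer      # Indexer API used to locate stored messages
```

Note how the `persistor`, `indexer` and `resubmitter` entries never repeat connection details; they only reference the `brokers` and `storage` IDs, which is exactly what the reference tables on the following pages describe.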
+ diff --git a/dataphos-docs/content/persistor/configuration/helm.md b/dataphos-docs/content/persistor/configuration/helm.md new file mode 100644 index 0000000..3efc722 --- /dev/null +++ b/dataphos-docs/content/persistor/configuration/helm.md @@ -0,0 +1,128 @@ +--- +title: "Helm" +draft: false +weight: 2 +--- + +# Configuration + +Persistor is deployed as a set of Kubernetes resources, each of which is highly configurable. + +The tables below contain the variables that can be configured as part of the Persistor's configuration. + +## Persistor Core + +Below are the variables used to configure the main Persistor component. The broker-specific configuration options should be taken into consideration along with the "Common" variables. + +### Configuration {#reference_persistor_general} + +Below is the list of configurable options in the `values.yaml` file. + +| Variable | Type | Description | Default | +|-----------------------|--------|---------------------------------------------------------------------------|---------------------------------------------------| +| namespace | string | The namespace to deploy the Schema Registry into. | `dataphos` | +| images | object | Docker images to use for each of the individual Persistor sub-components. | | +| images.persistor | string | The Persistor image. | `syntioinc/dataphos-persistor-core:1.0.0` | +| images.indexer | string | The Indexer image. | `syntioinc/dataphos-persistor-indexer:1.0.0` | +| images.indexerApi | string | The Indexer API image. | `syntioinc/dataphos-persistor-indexer-api:1.0.0` | +| images.indexerMongoDb | string | The Mongo image to be used by the indexer. | `mongo:4.0` | +| images.resubmitter | string | The Resubmitter image. + +### Broker Configuration + +The `values.yaml` file contains a `brokers` object used to set up the key references to be used by the validators to connect to one or more brokers deemed as part of the overall platform infrastructure. + +| Variable | Type | Description | Applicable If | +|------------------------------------|--------|--------------------------------------------------------------------------------------------------------------------|------------------------| +| brokers | object | The object containing the general information on the brokers the Persistor service(s) will want to associate with. | | +| brokers.BROKER_ID | object | The object representing an individual broker's configuration. | | +| brokers.BROKER_ID.type | string | Denotes the broker's type. | | +| brokers.BROKER_ID.connectionString | string | The Azure Service Bus Namespace connection string. | `type` == `servicebus` | +| brokers.BROKER_ID.projectID | string | The GCP project ID. | `type` == `pubsub` | +| brokers.BROKER_ID.brokerAddr | string | The Kafka bootstrap server address. | `type` == `kafka` | + +### Storage Configuration + +The `values.yaml` file contains a `storage` object used to set up the key references to be used by the Persistor components to connect to one or more storage destination deemed as part of the overall platform infrastructure. + +| Variable | Type | Description | Applicable If | +|-------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------------------|-------------------| +| storage | object | The object containing the general information on the storage services the Persistor service(s) will want to associate with. 
| | +| storage.STORAGE_ID | object | The object representing an individual storage destination configuration. | | +| storage.STORAGE_ID.type | string | Denotes the storage type. | | +| storage.STORAGE_ID.accountStorageID | string | The Azure Storage Account name. | `type` == `abs` | +| storage.STORAGE_ID.projectID | string | The GCP project ID. | `type` == `gcs` | + + +### Indexer Configuration {#reference_indexer} + +The `values.yaml` file contains a `indexer` object used to configure one or more indexer components to be deployed as part of the release, explicitly referencing brokers defined in the previous section. + +A single "indexer" object is defined as a set of a **database**, the **API exposing data in that database** and a **consumer component** responsible for pulling the indexer metadata into that database. + +| Variable | Type | Description | Default | Applicable If | +|--------------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|--------------------------------------| +| indexer | object | The object containing the information on all of the indexer(s) to be deployed as part of the Helm installation. | | | +| indexer.IDX_ID | object | The object representing the individual indexer's configuration. | | | +| indexer.IDX_ID.broker | string | Reference to the broker the indexer metadata is retrieved from. | | | +| indexer.IDX_ID.topic | string | The topic the indexer metadata are pulled from. | | | +| indexer.IDX_ID.consumerID | string | The consumer identifier (subscription, consumer group, etc.) | | | +| indexer.IDX_ID.deadletterTopic | string | The name of the dead-letter topic used for failed data processing. If not set, dead-lettering is disabled (must be enabled if `indexer.IDX_ID.broker` == `kafka`). | | | +| indexer.IDX_ID.batchSize | string | Maximum number of messages in a batch. | `"10000"` | | +| indexer.IDX_ID.batchMemory | string | Maximum bytes in batch. | `"1000000"` | | +| indexer.IDX_ID.batchTimeout | string | Time to wait before writing batch if it is not full. | `30s` | | +| indexer.IDX_ID.replicas | integer | The number of replicas of the indexer API microservice. | | | +| indexer.IDX_ID.consumerReplicas | integer | The number of replicas of a given indexer instance to pull/process messages simultaneously. | | | +| indexer.IDX_ID.storageSize | string | The size of the Indexer's underlying Mongo storage. | | | +| indexer.IDX_ID.mongoConnectionString | string | An optional MongoDB connection string if using an external database. If this value is not set, a MongoDB instance will be deployed on the same cluster. | | | +| indexer.IDX_ID.serviceAccountSecret | string | The reference to a secret that contains a `key.json` key and the contents of a Google Service Account JSON file as its contents. | | `brokers.BROKER_ID.type` == `pubsub` | +| indexer.IDX_ID.serviceAccountKey | string | A Google Service Account private key in JSON format, base64 encoded. Used to create a new `serviceAccountSecret` secret, if provided. | | `brokers.BROKER_ID.type` == `pubsub` | + + +### Resubmitter Configuration {#reference_resubmitter} + +The `values.yaml` file contains a `resubmitter` object used to configure one or more resubmitter components to be deployed as part of the release, explicitly referencing brokers, indexer and storage objects defined in the previous section. 
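
Before the resubmitter reference below, here is a hedged sketch of the kind of broker, storage and indexer objects a resubmitter entry would point at. This Azure-flavoured example is illustrative only: all names, connection strings and sizes are placeholders, and only a subset of the options documented above is shown.

```yaml
# Illustration only; the resubmitter options themselves are listed in the table below.
brokers:
  azure-broker:
    type: servicebus
    connectionString: "<service-bus-namespace-connection-string>"

storage:
  azure-storage:
    type: abs
    accountStorageID: mystorageaccount

indexer:
  history-indexer:
    broker: azure-broker
    topic: indexer-topic
    consumerID: indexer-sub
    batchSize: "10000"        # defaults shown explicitly for readability
    batchMemory: "1000000"
    batchTimeout: 30s
    replicas: 1
    consumerReplicas: 1
    storageSize: 1Gi          # placeholder size for the underlying Mongo storage
```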
+ +| Variable | Type | Description | Applicable If | +|------------------------------------------|--------|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------| +| resubmitter | object | The object containing the information on all of the resubmitter services to be deployed as part of the Helm installation. | | +| resubmitter.RSMB_ID | object | The object representing the individual resubmitter's configuration. | | +| resubmitter.RSMB_ID.broker | string | Reference to the broker resubmitted messages will be sent to. | | +| resubmitter.RSMB_ID.storage | string | Reference to the storage the resubmitter service can connect to. | | +| resubmitter.RSMB_ID.indexer | string | The indexer instance whose API will be used to query the needed metadata. | | +| resubmitter.RSMB_ID.clientID | string | The Client ID of the Azure Service Principal with the required role assignments. | `brokers.BROKER_ID.type` == `servicebus` or `storage.STORAGE_ID.type` == `abs` | +| resubmitter.RSMB_ID.clientSecret | string | The Client Secret of the Azure Service Principal with the required role assignments. | `brokers.BROKER_ID.type` == `servicebus` or `storage.STORAGE_ID.type` == `abs` | +| resubmitter.RSMB_ID.tenantID | string | The Tenant ID of the Azure Service Principal with the required role assignments. | `brokers.BROKER_ID.type` == `servicebus` or `storage.STORAGE_ID.type` == `abs` | +| resubmitter.RSMB_ID.serviceAccountSecret | string | The reference to a secret that contains a `key.json` key and the contents of a Google Service Account JSON file as its contents. | `brokers.BROKER_ID.type` == `pubsub` or `storage.STORAGE_ID.type` == `gcs` | +| resubmitter.RSMB_ID.serviceAccountKey | string | A Google Service Account private key in JSON format, base64 encoded. Used to create a new `serviceAccountSecret` secret, if provided. | `brokers.BROKER_ID.type` == `pubsub` or `storage.STORAGE_ID.type` == `gcs` | + + +### Persistor Configuration {#reference_persistor} +The `values.yaml` file contains a `persistor` object used to configure one or more Persistor components to be deployed as part of the release, explicitly referencing the objects defined as part of the previous section. + +| Variable | Type | Description | Default | Applicable If | +|---------------------------------------|--------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------|--------------------------------------------------------------------------------| +| persistor | object | The object containing the information on all of the Persistors to be deployed as part of the Helm installation. | | | +| persistor.PES_ID | object | The object representing the individual Persistor's configuration. | | | +| persistor.PES_ID.broker | string | Reference to the broker messages are pulled from. | | | +| persistor.PES_ID.topic | string | The topic the messages are pulled from. | | | +| persistor.PES_ID.consumerID | string | The consumer identifier (subscription, consumer group, etc) | | | +| persistor.PES_ID.storage | string | The reference to the storage messages will be persisted to. | | | +| persistor.PES_ID.indexer | string | The reference to the indexer the Persistor is tied to. 
If not set, the Indexer plugin is disabled. | | | +| persistor.PES_ID.storageTargetID | string | The identifier of the storage container (bucket) to store the data to. | | | +| persistor.PES_ID.deadletterBroker | string | Optional reference to the broker containing the dead-letter topic. If the Indexer plugin is enabled, `indexer.broker` is used instead and this value is ignored. Required if the Indexer plugin is disabled and dead-lettering is enabled. | | | +| persistor.PES_ID.deadletterTopic | string | The name of the dead-letter topic used for failed data processing. If not set, dead-lettering is disabled (must be enabled if kafka broker is used). | | | +| persistor.PES_ID.batchSize | string | Maximum number of messages in a batch. | `"10000"` | | +| persistor.PES_ID.batchMemory | string | Maximum bytes in batch. | `"1000000"` | | +| persistor.PES_ID.batchTimeout | string | Time to wait before writing batch if it is not full. | `30s` | | +| persistor.PES_ID.storagePrefix | string | Prefix to be given to all files stored to the chosen target blob storage. | `msg` | | +| persistor.PES_ID.storageVersionKey | string | Key in metadata that messages get grouped by. | | | +| persistor.PES_ID.storageMask | string | Structure of the path under which batches are stored. | `year/month/day/hour` | | +| persistor.PES_ID.storageCustomValues | string | Comma-separated list of key:value pairs to include in the file path. If the file path contains `key1:value1` and `STORAGE_MASK` contains `key1`, then the path will contain `value1`. | | | +| persistor.PES_ID.replicas | string | The number of replicas of a given Persistor instance to pull/process messages simultaneously. | | | +| persistor.PES_ID.clientID | string | The Client ID of the Azure Service Principal with the required role assignments. | | `brokers.BROKER_ID.type` == `servicebus` or `storage.STORAGE_ID.type` == `abs` | +| persistor.PES_ID.clientSecret | string | The Client Secret of the Azure Service Principal with the required role assignments. | | `brokers.BROKER_ID.type` == `servicebus` or `storage.STORAGE_ID.type` == `abs` | +| persistor.PES_ID.tenantID | string | The Tenant ID of the Azure Service Principal with the required role assignments. | | `brokers.BROKER_ID.type` == `servicebus` or `storage.STORAGE_ID.type` == `abs` | +| persistor.PES_ID.serviceAccountSecret | string | The reference to a secret that contains a `key.json` key and the contents of a Google Service Account JSON file as its contents. | | `brokers.BROKER_ID.type` == `pubsub` or `storage.STORAGE_ID.type` == `gcs` | +| persistor.PES_ID.serviceAccountKey | string | A Google Service Account private key in JSON format, base64 encoded. Used to create a new `serviceAccountSecret` secret, if provided. | | `brokers.BROKER_ID.type` == `pubsub` or `storage.STORAGE_ID.type` == `gcs` | + diff --git a/dataphos-docs/content/persistor/configuration/pulumi.md b/dataphos-docs/content/persistor/configuration/pulumi.md new file mode 100644 index 0000000..99ae455 --- /dev/null +++ b/dataphos-docs/content/persistor/configuration/pulumi.md @@ -0,0 +1,238 @@ +--- +title: "Pulumi" +draft: false +weight: 3 +--- + +# Configuration + +There are three possible sources of resource configuration values: user configuration in the active stack configuration file, retrieved data from existing resources, and default system-level configuration from the application code. + +User configuration will always take precedence over other configuration sources. 
If there is no special user configuration for a parameter, the retrieved value from the resource’s previous configuration will be used. If there wasn’t any data retrieved for the resource (as it is being created for the first time), the default system-level configuration value will be used instead. The default values for parameters are listed in the appropriate section of the configuration options. + +If the configuration references an existing cloud resource, the program will retrieve its data from the cloud provider and import the resource into the active stack instead of creating a new one. If the user configuration values specify any additional parameters that differ from the resource configuration while it has not yet been imported into the stack, the deployment will fail. To modify an existing resource’s configuration, import it into the stack first and then redeploy the infrastructure with the desired changes. + +Note: Implicit import of an AKS cluster is currently not supported. To use an existing AKS cluster in your infrastructure, set the AKS cluster's import configuration option to true. + +⚠️ WARNING ⚠️ + +Imported resources will NOT be retained by default when the infrastructure is destroyed. If you want to retain a resource when the infrastructure is destroyed, you need to explicitly set its retain flag to true in the active stack's configuration file. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state on a pulumi destroy. Azure resource groups and GCP projects are set to be retained by default and can be deleted manually. Be careful if you choose not to retain them, as destroying them will remove ALL children resources, even the ones created externally. It is recommended to modify these options only if you are using a dedicated empty project/resource group. + + +## Global Configuration + +Below is the shared configuration used between all Persistor types. + +| Variable | Type | Description | Default value | +|--------------------------|---------|----------------------------------------------------------------------------------------------------------------|---------------| +| `namespace` | string | The name of the Kubernetes namespace where Dataphos Helm charts will be deployed to. | `dataphos` | +| `deployPersistor` | boolean | Whether the Persistor Helm chart should be deployed. | `false` | +| `retainResourceGroups` | boolean | Whether Azure resource groups should be retained when the infrastructure is destroyed. | `true` | +| `retainProjects` | boolean | Whether GCP projects should be retained when the infrastructure is destroyed. | `true` | +| `resourceTags` | object | Set of `key:value` tags attached to all Azure resource groups; or set of labels attached to all GCP resources. | | + + +## Product Configuration + +The `namespace` and `images` options at the top-level of the Helm chart configurations are set by default and do not need to be manually configured. + +Cloud-specific variables should not be manually configured. Depending on the configured cloud provider, service accounts with appropriate roles are automatically created and their credentials are used to populate these variables. 
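
As a rough illustration of how these pieces come together in a stack configuration file, consider the fragment below. It is a sketch only: the Pulumi key prefix (which depends on your project name) is omitted, the cluster and Persistor entries use placeholder identifiers, and the corresponding `brokers`/`storage` objects are left out for brevity. The table below lists the `dataphos-persistor` options in full.

```yaml
# Hypothetical stack-configuration fragment; key prefixes and all identifiers are placeholders.
deployPersistor: true
namespace: dataphos

cluster:
  gke-cluster:
    type: gke
    name: dataphos-cluster
    projectID: my-gcp-project
    location: europe-west2
    initialNodeCount: 3

dataphos-persistor:
  persistor:
    gcp-persistor:
      broker: gcp-broker          # defined under the brokers configuration (omitted here)
      topic: persistor-topic
      consumerID: persistor-sub
      storage: gcp-storage        # defined under the storage configuration (omitted here)
      storageTargetID: persistor-bucket
```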
+ +| Variable | Type | Description | +|---------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `dataphos-persistor` | object | Dataphos Persistor Helm chart configuration. Configuration options are listed in the [persistor general configuration]({{< ref "helm#reference_persistor_general">}}). | +| `dataphos-persistor.persistor` | object | The object containing the information on all of the Persistors to be deployed. Configuration options are listed in the [persistor configuration]({{< ref "helm#reference_persistor">}}). | +| `dataphos-persistor.indexer` | object | The object containing the information on all of the indexers to be deployed. Configuration options are listed in the [indexer configuration]({{< ref "helm#reference_indexer">}}). | +| `dataphos-persistor.resubmitter` | object | The object containing the information on all of the resubmitter services to be deployed. Configuration options are listed in the [resubmitter configuration]({{< ref "helm#reference_resubmitter">}}). | + + + +## Provider configuration options +The variables listed here are required configuration options by their respective Pulumi providers. Your entire infrastructure should reside on a single cloud platform. Deployment across multiple cloud platforms is currently not fully supported. + +{{< tabs "Provider configuration options" >}} + +{{< tab "Azure" >}} +| Variable | Type | Description | Example value | +|-------------------------|--------|------------------------------------|---------------| +| `azure-native:location` | string | The default resource geo-location. | `westeurope` | + +A list of all configuration options for this provider can be found here: +[Azure Native configuration options](https://www.pulumi.com/registry/packages/azure-native/installation-configuration/#configuration-options). + +{{}} + + +{{< tab "GCP" >}} +To successfully deploy resources in a GCP project, the appropriate APIs need to be enabled for that project in the API Console. See: [Enable and disable APIs](https://support.google.com/googleapi/answer/6158841). + +| Variable | Type | Description | Example value | +|---------------|--------|--------------------------|-------------------| +| `gcp:project` | string | The default GCP project. | `syntio-dataphos` | +| `gcp:region` | string | The default region.. | `europe-west2` | +| `gcp:zone` | string | The default zone. | `europe-west2-a` | + +A list of all configuration options for this provider can be found here: +[GCP configuration options](https://www.pulumi.com/registry/packages/gcp/installation-configuration/#configuration-reference). + +{{}} +{{}} + +## Cluster configuration options + +The stack configuration `cluster` object is utilized to configure the Kubernetes cluster necessary to deploy the Helm charts that comprise Dataphos products. + +### Common cluster configuration + +| Variable | Type | Description | +|-----------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cluster` | object | The object containing the general information on the cluster. 
| +| `cluster.CLUSTER_ID` | object | The object representing an individual cluster's configuration. | +| `cluster.CLUSTER_ID.type` | string | The type of the managed cluster. Valid values: [`gke`, `aks`]. | +| `cluster.CLUSTER_ID.name` | string | The name of the managed cluster. | +| `cluster.CLUSTER_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +### Specific cluster configuration + +{{< tabs "Cluster configuration options" >}} + +{{< tab "AKS" >}} +| Variable | Type | Description | Default value | +|----------------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------| +| `cluster.CLUSTER_ID.import` | boolean | Whether to use an existing AKS cluster instead of creating a new one.
**Note:** AKS clusters imported in this way will be retained on destroy, unless its resource group is not retained. | `false` | +| `cluster.CLUSTER_ID.resourceGroup` | string | The name of the resource group. The name is case insensitive. | | +| `cluster.CLUSTER_ID.sku` | object | The managed cluster SKU. | | +| `cluster.CLUSTER_ID.sku.name` | string | The managed cluster SKU name. | `Basic` | +| `cluster.CLUSTER_ID.sku.tier` | string | The managed cluster SKU tier. | `Free` | +| `cluster.CLUSTER_ID.dnsPrefix` | string | The cluster DNS prefix. This cannot be updated once the Managed Cluster has been created. | | +| `cluster.CLUSTER_ID.agentPoolProfiles` | object | The agent pool properties. | | +| `cluster.CLUSTER_ID.agentPoolProfiles.name` | string | Windows agent pool names must be 6 characters or less. | | +| `cluster.CLUSTER_ID.agentPoolProfiles.count` | integer | Number of agents (VMs) to host docker containers. | `3` | +| `cluster.CLUSTER_ID.agentPoolProfiles.enableAutoScaling` | boolean | Whether to enable auto-scaler. | `false` | +| `cluster.CLUSTER_ID.agentPoolProfiles.minCount` | integer | The minimum number of nodes for auto-scaling. | `1` | +| `cluster.CLUSTER_ID.agentPoolProfiles.maxCount` | integer | The maximum number of nodes for auto-scaling. | `5` | +| `cluster.CLUSTER_ID.agentPoolProfiles.vmSize` | string | VM size availability varies by region. See: [Supported VM sizes](https://docs.microsoft.com/azure/aks/quotas-skus-regions#supported-vm-sizes) | `Standard_DS2_v2` | +| `cluster.CLUSTER_ID.tags` | object | Set of `key:value` tags attached to the AKS Cluster. This will override the global `resourceTags` configuration option for this resource. | | + + +{{}} + +{{< tab "GKE" >}} + +| Variable | Type | Description | Default value | +|----------------------------------------------------------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cluster.CLUSTER_ID.projectID` | string | The project ID is a unique identifier for a GCP project. | | +| `cluster.CLUSTER_ID.location` | string | The geo-location where the resource lives. | | +| `cluster.CLUSTER_ID.initialNodeCount` | integer | The number of nodes to create in this cluster's default node pool. | `3` | +| `cluster.CLUSTER_ID.nodeConfigs` | object | Parameters used in creating the default node pool. | | +| `cluster.CLUSTER_ID.nodeConfig.machineType` | string | The name of a Google Compute Engine machine type. | `e2-medium` | +| `cluster.CLUSTER_ID.clusterAutoscalings` | object list | Per-cluster configuration of Node Auto-Provisioning with Cluster Autoscaler to automatically adjust the size of the cluster and create/delete node pools based on the current needs of the cluster's workload. | | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].autoscalingProfile` | string | Lets you choose whether the cluster autoscaler should optimize for resource utilization or resource availability when deciding to remove nodes from a cluster. Valid values: [`BALANCED`, `OPTIMIZE_UTILIZATION`]. | `BALANCED` | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].enabled` | boolean | Whether node auto-provisioning is enabled. 
| `false` | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].resourceLimits` | object list | Global constraints for machine resources in the cluster. Configuring the cpu and memory types is required if node auto-provisioning is enabled. | resourceLimits:
- resource_type: cpu
  minimum: 1
  maximum: 1
- resource_type: memory
  minimum: 1
  maximum: 1 | +| `cluster.CLUSTER_ID.resourceLabels` | object | Set of `key:value` labels attached to the GKE Cluster. This will override the global `resourceTags` configuration option for this resource. | | + +{{}} +{{}} + +## Broker configuration options +The stack configuration `brokers` object is used to set up the key references to be used by the dataphos components to connect to one or more brokers deemed to be part of the overall platform infrastructure. + +Product configs directly reference brokers by their `BROKER_ID` listed in the broker config. The same applies to `TOPIC_ID` and `SUB_ID` – the keys of those objects are the actual names of the topics and subscriptions used. + +### Common broker configuration + +| Variable | Type | Description | +|--------------------------------------------------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers` | object | The object containing the general information on the brokers. | +| `brokers.BROKER_ID` | object | The object representing an individual broker's configuration. | +| `brokers.BROKER_ID.type` | string | Denotes the broker's type. Valid values: [`kafka`, `pubsub`, `servicebus`]. | +| `brokers.BROKER_ID.topics` | object | The object containing the general information on the topics. | +| `brokers.BROKER_ID.topics.TOPIC_ID` | object | The object representing an individual topic's configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions` | object | The object containing topic subscription (consumer group) configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID` | object | The object representing an individual topic subscription's configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +The Azure storage account type. Valid values: [`Storage`, `StorageV2`, `BlobStorage`, `BlockBlobStorage`, `FileStorage`]. The default and recommended value is `BlockBlobStorage`. + +### Specific broker configuration + +{{< tabs Broker configuration options >}} +{{< tab "Azure Service Bus" >}} +| Variable | Type | Description | +|-----------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers.BROKER_ID.azsbNamespace` | string | The Azure Service Bus namespace name. | +| `brokers.BROKER_ID.resourceGroup` | string | The Azure Service Bus resource group name. | +| `brokers.BROKER_ID.sku` | object | The Azure Service Bus namespace SKU properties. | +| `brokers.BROKER_ID.sku.name` | string | Name of this SKU. Valid values: [`BASIC`, `STANDARD`, `PREMIUM`]. Default value is `STANDARD`. | +| `brokers.BROKER_ID.sku.tier` | string | The billing tier of this SKU. [`BASIC`, `STANDARD`, `PREMIUM`]. Default value is `STANDARD`. 
| +| `brokers.BROKER_ID.sku.capacity` | integer | The specified messaging units for the tier. For Premium tier, valid capacities are 1, 2 and 4. | +| `brokers.BROKER_ID.tags` | object | Set of `key:value` tags attached to the Azure Service Bus namespace. This will override the global `resourceTags` configuration option for this resource. | +| `brokers.BROKER_ID.retain` | boolean | If set to true, the Azure Service Bus namespace will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +{{}} + + +{{< tab "Google Cloud Pub/Sub" >}} +| Variable | Type | Description | +|--------------------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers.BROKER_ID.projectID` | string | The GCP project ID. | +| `brokers.BROKER_ID.topics.TOPIC_ID.labels` | object | Set of `key:value` labels attached to the Pub/Sub topic. This will override the global `resourceTags` configuration option for this resource. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID.labels` | object | Set of `key:value` labels attached to the Pub/Sub subscription. This will override the global `resourceTags` configuration option for this resource. | + +{{}} + +{{< tab "Kafka" >}} +| Variable | Type | Description | Default value | +|------------------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| `brokers.BROKER_ID.brokerAddr` | string | The Kafka bootstrap server address. Optional. If omitted or empty, a new Strimzi Kafka cluster operator and cluster will be deployed with default settings. | | +| `brokers.BROKER_ID.clusterName` | string | The name of the Strimzi Kafka cluster custom Kubernetes resource. | `kafka-cluster` | +| `brokers.BROKER_ID.clusterNamespace` | string | The Kubernetes namespace where the cluster will be deployed. | `kafka-cluster` | +| `brokers.BROKER_ID.topics.TOPIC_ID.partitions` | integer | Number of partitions for a specific topic. | `3` | +| `brokers.BROKER_ID.topics.TOPIC_ID.replicas` | integer | Number of replicas for a specific topic. | `1` | + +{{}} +{{}} + + +## Storage configuration options +The stack configuration `storage` object is used to set up the key references to be used by the dataphos components to connect to one or more storage destinations deemed as part of the overall platform infrastructure. + +Product configs directly reference storage components by their `STORAGE_ID` listed in the storage config. The same applies to `BUCKET_ID` – the keys of those objects are the actual names of the buckets used. + +### Common storage configuration + +| Variable | Type | Description | +|-----------------------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `storage` | object | The object containing the general information on the storage services. | +| `storage.STORAGE_ID` | object | The object representing an individual storage destination configuration. | +| `storage.STORAGE_ID.type` | string | Denotes the storage type. Valid values: [`abs`, `gcs`]. 
| +| `storage.STORAGE_ID.buckets` | object | The object containing the general information on the buckets. | +| `storage.STORAGE_ID.buckets.BUCKET_ID` | object | The object representing an individual bucket. | +| `storage.STORAGE_ID.buckets.BUCKET_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +### Specific storage configuration + +{{< tabs "Storage configuration options" >}} + +{{< tab "Azure Blob Storage" >}} +| Variable | Type | Description | +|---------------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `storage.STORAGE_ID.accountStorageID` | string | The Azure storage account name. | +| `storage.STORAGE_ID.resourceGroup` | string | The Azure resource group name. | +| `storage.STORAGE_ID.kind` | string | The Azure storage account type. Valid values: [`Storage`, `StorageV2`, `BlobStorage`, `BlockBlobStorage`, `FileStorage`]. The default and recommended value is `BlockBlobStorage`. | +| `storage.STORAGE_ID.tags` | object | Set of `key:value` tags attached to the Azure storage account. This will override the global `resourceTags` configuration option for this resource. | +| `storage.STORAGE_ID.retain` | boolean | If set to true, the Azure storage account will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +{{}} + + +{{< tab "Google Cloud Storage" >}} +| Variable | Type | Description | +|-----------------------------------------------|--------|--------------------------------------------------------------------------------------------------------------------------------------------| +| `storage.STORAGE_ID.projectID` | string | The GCP project ID. | +| `storage.STORAGE_ID.buckets.BUCKET_ID.labels` | object | Set of `key:value` labels attached to the GCS bucket. This will override the global `resourceTags` configuration option for this resource. | + +{{}} +{{}} diff --git a/dataphos-docs/content/persistor/configuration/shell.md b/dataphos-docs/content/persistor/configuration/shell.md new file mode 100644 index 0000000..a0a953b --- /dev/null +++ b/dataphos-docs/content/persistor/configuration/shell.md @@ -0,0 +1,373 @@ +--- +title: "Shell" +draft: false +weight: 1 +--- + +# Configuration + +Persistor is deployed as a set of Kubernetes resources, each of which is highly configurable. + +The tables below contain the variables that can be configured as part of the Persistor's configuration. + +## Persistor Core + +Below are the variables used to configure the main Persistor component. The broker-specific configuration options should be taken into consideration along with the "Common" variables. + +{{< tabs "Configuration" >}} +{{< tab "Common Configuration" >}} + +## Common Configuration + +Below is the shared configuration used between all Persistor types. 
+ +| Variable | Example Value | Possible Values | Description | Required | +|----------------------------|---------------------|-----------------------------------------------------|----------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------| +| READER_TYPE | "pubsub" | ["pubsub", "kafka", "servicebus"] | Type of broker used for reader | yes | +| SENDER_TYPE | "pubsub" | ["pubsub", "kafka", "servicebus"] | Type of broker used for sender | no (yes if dead letter or indexer are enabled) | +| STORAGE_TYPE | "gcs" | ["azure", "gcs"] | Type of storage used | yes | +| STORAGE_DESTINATION | "my_bucket" | N/A | Name of GCS bucket or ABS container | yes | +| STORAGE_TOPICID | "my_topic" | N/A | Topic's name | yes | +| STORAGE_EXTENSION | "avro" | N/A | Extension of the files stored to blob storage. | yes | +| DEADLETTERENABLED | "true" | ["true", "false"] | Whether messages will be sent to dead letter upon error | Default: "true" (must not set to false if reader is kafka) | +| SENDER_DEADLETTERTOPIC | "persistor_dltopic" | N/A | Dead letter topic name | no (yes if reader is kafka) | + +### Enabling the Indexer Plugin + +Below are the variables to be used when deploying the Persistor alongside the Indexer plugin. + +| Variable | Example Value | Possible Values | Description | Required | +|------------------------|--------------------------|-------------------------------------------------------------------|:---------------------------------------------------------------------:|-------------------------------------------------------| +| INDEXERENABLED | "true" | ["true", "false"] | Whether to send messages to Indexer topic or not. | Default: true (set to false if Indexer is not needed) | +| SENDER_TOPICID | "persistor-indexertopic" | N/A | ID of the topic used for communication between Persistor and Indexer. | yes if indexer is enabled | + +{{}} +{{< tab "Additional Configuration" >}} + +## Additional Configuration + +Below are more advanced configuration options available as part of the Persistor's configuration suite. + +| Variable | Example Value | Possible Values | Description | Required | +|----------------------------|--------------------------|---------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------|---------------------------------| +| BATCHSETTINGS_BATCHSIZE | "5000" | N/A | Maximum number of messages in a batch. | Default: 5000 | +| BATCHSETTINGS_BATCHMEMORY | "1000000" | N/A | Maximum bytes in batch. | Default: 1000000 | +| BATCHSETTINGS_BATCHTIMEOUT | "30s" | N/A | Time to wait before writing batch if it is not full. | Default: 30s | +| STORAGE_MASK | "day/hour" | any subset of (year, month, day, hour), custom values and message attributes separated by / | Path within a blob storage where messages will be saved. | Default: "year/month/day/hour/" | +| STORAGE_CUSTOMVALUES | "org:company,dept:sales" | Comma-separated list of pairs of the form key:value | Values to include in file path. If it contains key1:value1 and STORAGE_MASK has key1, then path will contain value1. | no | +| MINIMUM_LOG_LEVEL | "WARN" | ["WARN", "ERROR", "INFO"] | The minimal level used in logs. | Default: "INFO" | +| STORAGE_PREFIX | "msg" | N/A | Prefix to be given to all files stored to the chosen target blob storage. 
| Default: "msg" | + + +### Notes on storage path + +Batch locations on storage can be made dependent on message attributes. If batching by a specific message attribute key is desired, it needs to be included in the `STORAGE_MASK` variable between curly {} parentheses. Messages with missing attributes will have the appropriate part of the path replaced by the `unknown` keyword. + +If STORAGE_MASK is configured as `month/day/{schema_ID}/{publish_reason}`, messages received on October 31 with metadata `{schema_ID:schema_A, publish_reason:audit}` will be stored in blobs under the path `10/31/schema_A/audit`. The remaining attributes other than these two have no effect on the batching. + +Messages missing the `publish_reason` metadata key will be batched in `10/31/schema_A/unknown`. + +The start of a path can be configured using custom values. If `STORAGE_MASK` starts with `path_prefix/` and the variable `STORAGE_CUSTOMVALUES` is set to `path_prefix:persistor-location`, paths in the chosen blob storage destination will start with `persistor-location/`. + +{{}} +{{< tab "GCP Pub/Sub" >}} + +## GCP Configuration + +Below are the variables relevant to users utilizing Google PubSub as the broker type messages are being pulled from and/or using Google Cloud Storage as the destination storage option. + +### PubSub used + +| Variable | Example Value | Possible Values | Description | Required | +|-------------------------|------------------|-----------------|:-------------------------------------------:|------------------------------------------------| +| READER_PUBSUB_PROJECTID | "my-gcp-project" | N/A | ID of the GCP project you're working on | yes | +| SENDER_PUBSUB_PROJECTID | "my-gcp-project" | N/A | ID of the GCP project you're working on | no (yes if indexer or dead letter are enabled) | +| READER_PUBSUB_SUBID | "persistor-sub" | N/A | Pubsub subscription that messages come from | yes | + +{{}} +{{< tab "Azure" >}} + +## Azure Configuration + +Below are the variables relevant to users utilizing Azure Service Bus as the broker type messages are being pulled from and/or using Azure Blob Storage as the destination storage option. + +### Common Configuration + +| Variable | Example Value | Possible Values | Description | Required | +|---------------------|----------------------------------------|-----------------|------------------------------------------|----------| +| AZURE_CLIENT_ID | "19b725a4-1a39-5fa6-bdd0-7fe992bcf33c" | N/A | Client ID of your Service Principal | yes | +| AZURE_TENANT_ID | "38c345b5-1b40-7fb6-acc0-5ab776daf44e" | N/A | Tenant ID of your Service Principal | yes | +| AZURE_CLIENT_SECRET | "49d537a6-8a49-5ac7-ffe1-6fe225abe33f" | N/A | Client secret of your Service Principal | yes | + +### Azure Service Bus used + +| Variable | Example Value | Possible Values | Description | Required | +|------------------------------------|--------------------------|-----------------|:------------------------------------------------------:|-----------------------------------------------| +| READER_SERVICEBUS_CONNECTIONSTRING | "Endpoint=sb://..." | N/A | Connection string for the service bus namespace | yes | +| READER_SERVICEBUS_TOPICID | "persistor-topic" | N/A | Service bus topic name | yes | +| READER_SERVICEBUS_SUBID | "persistor-subscription" | N/A | Service bus subscription name | yes | +| SENDER_SERVICEBUS_CONNECTIONSTRING | "Endpoint=sb://..." 
| N/A | Connection string for the sender service bus namespace | no (yes is indexer or deadletter are enabled) | + +### Azure Blob Storage used +| Variable | Example Value | Possible Values | Description | Required | +|--------------------------|--------------------------|-----------------|:-------------------------:|-----------------------------------------------| +| STORAGE_STORAGEACCOUNTID | "persistor-storage" | N/A | ID of the storage account | yes | + +{{}} +{{< tab "Kafka" >}} + +## Kafka Configuration + +Below are the variables relevant to users utilizing Apache Kafka as the broker type messages are being pulled from. Should be used in conjunction with GCS and Azure Blob Storage as the destination storage of choice. + +### Kafka used + +| Variable | Example Value | Possible Values | Description | Required | +|-----------------------------|-------------------|-----------------|:---------------------------------------------:|----------| +| READER_KAFKA_ADDRESS | "localhost:9092" | N/A | Address of the kafka broker | yes | +| READER_KAFKA_GROUPID | "persistor" | N/A | Consumer group's name. | yes | +| READER_KAFKA_TOPICID | "persistor-topic" | N/A | Kafka source topic name | yes | +| SENDER_KAFKA_ADDRESS | "localhost:9092" | N/A | Address of the kafka broker for sender | yes | + +{{}} +{{}} + +## Indexer + +Below are the variables used to configure the Indexer component -- the component responsible for pulling the Indexer metadata generated by the Persistor from the dedicated broker configuration and storing it in the NoSQL (Mongo) database for resubmission purposes. + +An Indexer "type" is determined based on the message broker used as the communication channel to receive the required metadata. + +{{< tabs "Indexer Configuration" >}} +{{< tab "Common Configuration" >}} + +## Common Configuration + +Below is the shared configuration between all Indexer types. + +| Variable | Example Value | Possible Values | Description | Required | +|----------------------------|--------------------------------------------------|-----------------------------------|-------------------------------------------------------------|------------------------------------------------------------------| +| READER_TYPE | "pubsub" | ["pubsub", "kafka", "servicebus"] | Type of broker used | yes | +| SENDER_TYPE | "pubsub" | ["pubsub", "kafka", "servicebus"] | Type of broker used for sender | no (yes if reader is kafka or if indexer is enabled) | +| DEADLETTERENABLED | "true" | ["true", "false"] | Whether messages will be sent to dead letter upon error | no (yes, "true" if reader is kafka, otherwise defaults to false) | +| SENDER_DEADLETTERTOPIC | "persistor-dltopic" | N/A | Dead letter topic name | no (yes if reader is kafka) | +| MONGO_CONNECTIONSTRING | "mongodb://mongo-0.mongo-service.dataphos:27017" | N/A | MongoDB connection string | yes | +| MONGO_DATABASE | "indexer_db" | N/A | Mongo database name to store metadata in | yes | +| MONGO_COLLECTION | "indexer_collection" | N/A | Mongo collection name (will be created if it doesn’t exist) | yes | +| MINIMUM_LOG_LEVEL | "WARN" | ["WARN", "ERROR", "INFO"] | The minimal level used in logs. | Default: "INFO" | + +{{}} + +{{< tab "Advanced Configuration" >}} + +## Advanced Configuration + +Below are the additional configuration options offered by the Indexer. 
+ +| Variable | Example Value | Possible Values | Description | Required | +|----------------------------|--------------------------------------|---------------------------|----------------------------------------------------------|-------------------| +| BATCHSETTINGS_BATCHSIZE | "5000" | N/A | Maximum number of messages in a batch. | Default: 5000 | +| BATCHSETTINGS_BATCHMEMORY | "1000000" | N/A | Maximum bytes in batch. | Default: 1000000 | +| BATCHSETTINGS_BATCHTIMEOUT | "30s" | N/A | Time to wait before writing batch if it is not full. | Default: 30s | + +{{}} + +{{< tab "GCP" >}} + +## GCP Configuration + +Below are the configuration options if Google PubSub is used as the communication channel between the components. + +| Variable | Example Value | Possible Values | Description | Required | +|-------------------------|------------------|-----------------|:-------------------------------------------:|---------------------------------| +| READER_PUBSUB_PROJECTID | "my-gcp-project" | N/A | ID of the GCP project you're working on | yes | +| SENDER_PUBSUB_PROJECTID | "my-gcp-project" | N/A | ID of the GCP project you're working on | no (yes dead letter is enabled) | +| READER_PUBSUB_SUBID | "indexer-sub" | N/A | Pubsub subscription that messages come from | yes | + +{{}} +{{< tab "Azure" >}} + +### Azure Service Bus Configuration + +Below are the configuration options if Azure Service Bus is used as the communication channel between the components. + +| Variable | Example Value | Possible Values | Description | Required | +|------------------------------------|---------------------------|-----------------|:------------------------------------------------------:|-----------------------------------------------| +| READER_SERVICEBUS_CONNECTIONSTRING | "Endpoint=sb://..." | N/A | Connection string for the service bus namespace | yes | +| READER_SERVICEBUS_TOPICID | "persistor-indexer-topic" | N/A | Service bus topic name | yes | +| READER_SERVICEBUS_SUBID | "indexer-subscription" | N/A | Service bus subscription name | yes | +| SENDER_SERVICEBUS_CONNECTIONSTRING | "Endpoint=sb://..." | N/A | Connection string for the sender service bus namespace | no (yes is indexer or deadletter are enabled) | + +{{}} +{{< tab "Kafka" >}} + + +### Kafka Configuration + +Below are the configuration options if Apache Kafka is used as the communication channel between the components. + +| Variable | Example Value | Possible Values | Description | Required | +|-----------------------------|------------------|-----------------|:---------------------------------------------:|----------| +| READER_KAFKA_ADDRESS | "localhost:9092" | N/A | Address of the kafka broker | yes | +| READER_KAFKA_GROUPID | "indexer" | N/A | Consumer group's name. | yes | +| READER_KAFKA_TOPICID | "indexer-topic" | N/A | Kafka source topic name | yes | +| SENDER_KAFKA_ADDRESS | "localhost:9092" | N/A | Address of the kafka broker for sender | yes | + +{{}} +{{}} + +# Indexer API + +The Indexer API is created on top of the initialized Mongo database and used to query the Indexer metadata. + +{{< tabs "Indexer API Configuration" >}} +{{< tab "Simple Configuration" >}} + +## Simple Configuration + +Below are the minimum configuration options required for the Indexer API to work. 
+ +| Variable | Example Value | Possible Values | Description | Required | +|-------------------|-----------------------------------------|---------------------------|:------------------------------------------------------:|------------------| +| CONN | "mongodb://mongo-0.mongo-service:27017" | N/A | MongoDB connection string. | yes | +| DATABASE | "indexer_db" | N/A | MongoDB database from which Indexer will read messages | yes | + + +{{}} +{{< tab "Advanced Configuration" >}} + +## Advanced Configuration + +Below are additional configuration options offered by the Indexer API. + +| Variable | Example Value | Possible Values | Description | Required | +|-------------------|-----------------------------------------|---------------------------|:------------------------------------------------------:|------------------| +| MINIMUM_LOG_LEVEL | "WARN" | ["WARN", "ERROR", "INFO"] | The minimal level used in logs. | Default: "INFO" | +| SERVER_ADDRESS | ":8080" | N/A | Port on which Indexer API will listen for traffic | Default: ":8080" | +| USE_TLS | "false" | ["true", "false"] | Whether to use TLS or not | Default: "false" | +| SERVER_TIMEOUT | "2s" | N/A | The amount of time allowed to read request headers | Default: "2s" | + +{{}} +{{}} + +# Resubmitter API + +The Resubmitter API is connected to the Indexer API for efficient fetching of data. It is dependent on the type of storage it is meant to query and the destination broker. + +{{< tabs "Resubmitter API Configuration" >}} +{{< tab "Common Configuration" >}} + +## Common Configuration + +Below are the common configuration options for the Resubmitter API. + +| Variable | Example Value | Possible Values | Description | Required | +|--------------------|-----------------------------|-----------------------------------|:---------------------------------------------------------------------------:|------------------| +| INDEXER_URL | "http://34.77.44.130:8080/" | N/A | The URL of the Indexer API with which the Resubmitter will communicate with | yes | +| STORAGE_TYPE | "gcs" | ["gcs", "abs"] | Type of storage used by Persistor | yes | +| PUBLISHER_TYPE | "pubsub" | ["servicebus", "kafka", "pubsub"] | Type of broker used | yes | +| SERVER_ADDRESS | ":8081" | N/A | Port on which Resubmitter will listen for traffic | Default: ":8081" | + + +{{}} + +{{< tab "Advanced Configuration" >}} + +## Advanced Configuration + +Below are the additional configuration options offered by the Resubmitter API. + +| Variable | Example Value | Possible Values | Description | Required | +|-----------------------------|-----------------------------|-----------------------------------------------|:--------------------------------------------------------------------------------:|------------------| +| MINIMUM_LOG_LEVEL | "WARN" | ["WARN", "ERROR", "INFO"] | The minimal level used in logs. 
| Default: "INFO" | +| RSB_META_CAPACITY | "20000" | N/A | Maximum number of messages which Indexer will return from MongoDB at once | Default: "10000" | +| RSB_FETCH_CAPACITY | "200" | N/A | Maximum number of workers in Resubmitter that are used for fetching from storage | Default: "100" | +| RSB_WORKER_NUM | "3" | N/A | Number of workers in Resubmitter that are used for packaging records | Default: "2" | +| RSB_ENABLE_MESSAGE_ORDERING | "false" | ["true", "false"] | Whether to publish messages using ordering keys | Default: "false" | +| USE_TLS | "false" | ["true", "false"] | Whether to use TLS or not | Default: "false" | +| SERVER_TIMEOUT | "2s" | N/A | The amount of time allowed to read request headers | Default: "2s" | + +{{}} + +{{< tab "GCP" >}} + +## GCP Configuration + +Below are the options to be configured if Google PubSub is used as the destination broker for resubmission and/or Google Cloud Storage is the data source used for the resubmission. + +### Common Configuration + +| Variable | Example Value | Possible Values | Description | Required | +|-------------------|------------------|-----------------|:---------------------:|----------| +| PUBSUB_PROJECT_ID | "my-gcp-project" | N/A | ID of the GCP project | yes | + +### PubSub as Target Broker + +| Variable | Example Value | Possible Values | Description | Required | +|----------------------------------|---------------|-------------------|:----------------------------------------------------------------------------------------------------------:|-----------------------| +| PUBLISH_TIMEOUT | "15s" | N/A | The maximum time that the client will attempt to publish a bundle of messages. | Default: "15s" | +| PUBLISH_BYTE_THRESHOLD | "50" | N/A | Publish a batch when its size in bytes reaches this value. | Default: "50" | +| PUBLISH_COUNT_THRESHOLD | "50" | N/A | Publish a batch when it has this many messages. | Default: "50" | +| PUBLISH_DELAY_THRESHOLD | "50ms" | N/A | Publish a non-empty batch after this delay has passed. | Default: "50ms" | +| NUM_PUBLISH_GOROUTINES | "52428800" | N/A | The number of goroutines used in each of the data structures that are involved along the the Publish path. | Default: "52428800" | +| MAX_PUBLISH_OUTSTANDING_MESSAGES | "800" | N/A | MaxOutstandingMessages is the maximum number of buffered messages to be published. | Default: "800" | +| MAX_PUBLISH_OUTSTANDING_BYTES | "1048576000" | N/A | MaxOutstandingBytes is the maximum size of buffered messages to be published. | Default: "1048576000" | +| PUBLISH_ENABLE_MESSAGE_ORDERING | "false" | ["true", "false"] | Whether to publish messages using oredering keys | Default: "false" | + +{{}} +{{< tab "Azure" >}} + +## Azure Configuration + +Below are the options to be configured if Azure Service Bus is used as the destination broker for resubmission and/or Azure Blob Storage is the data source used for the resubmission. 
+ +### Common Configuration + +| Variable | Example Value | Possible Values | Description | Required | +|---------------------|----------------------------------------|-----------------|:---------------------------------------:|----------| +| AZURE_CLIENT_ID | "19b725a4-1a39-5fa6-bdd0-7fe992bcf33c" | N/A | Client ID of your Service Principal | yes | +| AZURE_TENANT_ID | "38c345b5-1b40-7fb6-acc0-5ab776daf44e" | N/A | Tenant ID of your Service Principal | yes | +| AZURE_CLIENT_SECRET | "49d537a6-8a49-5ac7-ffe1-6fe225abe33f" | N/A | Client secret of your Service Principal | yes | + +### Service Bus as Target Broker + +| Variable | Example Value | Possible Values | Description | Required | +|----------------------|---------------------------------------------------------------------|-----------------|:---------------------------------------:|----------| +| SB_CONNECTION_STRING | "Endpoint=sb://foo.servicebus.windows.net/;SharedAccessKeyName=Roo" | N/A | Connection string for Azure Service Bus | yes | + +### Azure Blob Storage used as Resubmission Source + +| Variable | Example Value | Possible Values | Description | Required | +|----------------------------|------------------|-----------------|:----------------------------------:|----------| +| AZURE_STORAGE_ACCOUNT_NAME | mystorageaccount | N/A | Name of the Azure Storage Account. | yes | + +{{}} +{{< tab "Kafka" >}} + +## Kafka Configuration + +Below are the options to be configured if Apache Kafka is used as the destination broker for resubmission. + +### Kafka as Target Broker + +| Variable | Example Value | Possible Values | Description | Required | +|---------------------------|------------------------|-------------------|:-------------------------------------------------------------------------------------------------------------------------------------------:|---------------------------------| +| KAFKA_BROKERS | "localhost:9092" | N/A | Comma-separated list of at least one broker which is a member of the target cluster | yes | +| KAFKA_USE_TLS | "false" | ["true", "false"] | Whether to use TLS or not | Default: "false" | +| KAFKA_USE_SASL | "false" | ["true", "false"] | Whether to use SASL or not | Default: "false" | +| SASL_USERNAME | "sasl_user" | N/A | SASL username | yes if using SASL, otherwise no | +| SASL_PASSWORD | "sasl_pwd" | N/A | SASL password | yes if using SASL, otherwise no | +| KAFKA_SKIP_VERIFY | "false" | ["true", "false"] | Controls whether a client verifies the server's certificate chain and host name | Default: "false" | +| KAFKA_DISABLE_COMPRESSION | "false" | ["true", "false"] | Whether to use message compression or not | Default: "false" | +| KAFKA_BATCH_SIZE | "40" | N/A | BatchSize sets the max amount of records the client will buffer, blocking new produces until records are finished if this limit is reached. | Default: "40" | +| KAFKA_BATCH_BYTES | "52428800" | N/A | BatchBytes parameter controls the amount of memory in bytes that will be used for each batch. | Default: "52428800" | +| KAFKA_BATCH_TIMEOUT | "10ms" | N/A | Linger controls the amount of time to wait for additional messages before sending the current batch. | Default: "10ms" | +| ENABLE_KERBEROS | "false" | ["true", "false"] | Whether to enable Kerberos or not | Default: false | +| KRB_CONFIG_PATH | "/path/to/config/file" | N/A | Path to the Kerberos configuration file | yes, if kerberos is enabled | +| KRB_REALM | "REALM.com" | N/A | domain over which a Kerberos authentication server has the authority to authenticate a user, host or service. 
| yes, if kerberos is enabled | +| KRB_SERVICE_NAME | "kerberos-service" | N/A | Service name we will get a ticket for. | yes, if kerberos is enabled | +| KRB_KEY_TAB | "/path/to/file.keytab" | N/A | Path to the keytab file | yes, if kerberos is enabled | +| KRB_USERNAME | "user" | N/A | Username of the service principal | yes, if kerberos is enabled | + +{{}} +{{}} \ No newline at end of file diff --git a/dataphos-docs/content/persistor/quickstart/_index.md b/dataphos-docs/content/persistor/quickstart/_index.md new file mode 100644 index 0000000..221fe00 --- /dev/null +++ b/dataphos-docs/content/persistor/quickstart/_index.md @@ -0,0 +1,14 @@ +--- +title: "Quickstart" +draft: false +weight: 3 +geekdocCollapseSection: true +--- + +There are 3 options for deploying dataphos components, including the Persistor: +{{< toc-tree >}} + +The quickstart guides will get you a working Persistor deployment as quickly as possible. Use any of the three deployment options and follow the guide. +The [Deployment Customization](/persistor/configuration) contains a detailed overview of configuration parameters if you wish to customize the configuration according to your requirements. + + diff --git a/dataphos-docs/content/persistor/quickstart/helm.md b/dataphos-docs/content/persistor/quickstart/helm.md new file mode 100644 index 0000000..21026cd --- /dev/null +++ b/dataphos-docs/content/persistor/quickstart/helm.md @@ -0,0 +1,145 @@ +--- +title: "Helm" +draft: false +weight: 2 +--- + +## Setting Up Your Environment + +### Prerequisites + +This quickstart guide will assume that you have [Helm](https://helm.sh/) installed. +If you happen to be using VS Code make sure to have the Kubernetes and Helm extensions installed to make life a little easier for you. Helm repository can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm). + +Persistor has multiple message broker options and storage options. This quickstart guide will assume that the publishing message broker will be either GCP Pub/Sub, Azure ServiceBus or Kafka, and for storage options Google Cloud Storage(GCS) or Azure Blob Storage. These resources must be running before the deployment: + +{{< tabs "platformconfig" >}} +{{< tab "GCP (Pub/Sub to GCS)" >}} + +## Google PubSub to Google Cloud Storage +- Service account JSON key with the appropriate roles: ([Service Account Creation](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console), [JSON Key Retrieval](https://cloud.google.com/iam/docs/keys-create-delete)) + - Pub/Sub Editor + - Storage Object Admin +- Topic the messages should be persisted from +- The Subscription the Persistor will use to pull the messages from +- Indexer topic and subscription +- Resubmission topic +- GCS bucket +- Optional dead-letter topic (used as a last resort in case of unsolvable issues), with a subscription to retain messages +{{< /tab >}} +{{< tab "Azure (Service Bus to Azure Blob Storage)" >}} +## Azure Service Bus to Azure Blob Storage +- Service principal with roles: + - Azure Service Bus Data Sender + - Azure Service Bus Data Receiver + - Storage Blob Data Contributor + - Don't forget to *save* the `CLIENT_ID`, `CLIENT_SECRET` and `TENANT_ID` values when creating the service principal. 
+- Service Bus Namespace ([Service Bus Namespace Creation](https://learn.microsoft.com/en-us/azure/service-bus-messaging/service-bus-quickstart-portal#create-a-namespace-in-the-azure-portal)) +- Topic the messages should be persisted from +- The Subscription the Persistor will use to pull the messages from +- Indexer topic and subscription +- Resubmission topic +- Azure Storage account +- Azure blob storage container +- Optional dead-letter topic (used as a last resort in case of unsolvable issues), with a subscription to retain messages +{{< /tab >}} +{{< tab "Kafka (to GCS)" >}} +## Kafka to Google Cloud Storage +- An existing Kafka broker. You can create one yourself in a Kubernetes environment via [Strimzi](https://strimzi.io/docs/operators/0.30.0/quickstart.html), should you choose to do so. +- Service account JSON key with the appropriate roles: ([Service Account Creation](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console), [JSON Key Retrieval](https://cloud.google.com/iam/docs/keys-create-delete)) + - Stackdriver Resource Metadata Writer + - Logs Writer + - Monitoring Metric Writer + - Monitoring Viewer + - Storage Object Admin +- Topic the messages should be persisted from +- Indexer topic +- Resubmission topic +- GCS bucket +- Dead-letter topic (used as a last resort in case of unsolvable issues) +{{< /tab >}} +{{< tab "Kafka (to Azure Blob Storage)" >}} +## Kafka to Azure Blob Storage +- An existing Kafka broker. You can create one yourself in a Kubernetes environment via [Strimzi](https://strimzi.io/docs/operators/0.30.0/quickstart.html), should you choose to do so. +- Service principal with roles: + - Storage Blob Data Contributor + - Don't forget to *save* the `CLIENT_ID`, `CLIENT_SECRET` and `TENANT_ID` values when creating the service principal. +- Topic the messages should be persisted from +- Indexer topic +- Resubmission topic +- Azure Storage account +- Azure blob storage container +- Dead-letter topic (used as a last resort in case of unsolvable issues) +{{< /tab >}} +{{< /tabs >}} + +### Chart Usage + +Each chart has its own configuration settings outlined in its respective subfolder. A `values.yaml` file should be prepared and pass to Helm while performing the installation. Chart can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm/tree/main/dataphos-persistor). + +To deploy the `dataphos-persistor` chart, run: + +```bash +helm install persistor ./dataphos-persistor +``` + +This causes the `values.yaml` file to be read from the root directory of the `dataphos-persistor` folder. The `--values flag` may be passed in the call to override this behavior. + +You can also add a `--dry-run` flag that will simply generate the Kubernetes manifests and check if they are valid (note that this requires `kubectl` to be configured against an actual cluster). For general linting of the Helm templates, run `helm lint`. + +## Resubmitter API + +The Resubmitter allows the user to resubmit the stored messages to a destination resubmission topic of their choice. While the Resubmitter allows resubmission based on a number of parameters, in this example, we will resubmit messages based on the **time range** they were ingested onto the platform. 
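+
+The examples below target the Resubmitter's Kubernetes service address. A quick way to look up that address (assuming the Persistor components were deployed into a namespace named `dataphos`; substitute your own namespace) is to list the services and note the Resubmitter's entry. Depending on how the service is exposed, you may instead need to reach it via `kubectl port-forward`.
+
+```bash
+# List the services in the deployment namespace ("dataphos" is an assumption)
+# and note the address of the Resubmitter service.
+kubectl get svc -n dataphos
+```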
+ + +### Replaying messages based on the ingestion interval + +To resubmit messages using this endpoint, send a **POST** request to the resubmitter service deployed on your Kubernetes cluster: + +```bash +http://:8081/range/indexer_collection?topic= +``` + +With the `` representing the name of the **destination** topic you wish to replay the messages to. Note that, as a best-practice, this should be different from the original topic messages were pulled from, to ensure message resending does not affect all downstream consumers of the original topic unnecessarily. + +The actual request body contains the information from which topic data were initally received, and what time range +the messages were received. + +In this case, JSON attribute *broker_id* was used. + +```json +{ + "broker_id": "origin_broker_id", + "lb": "0001-01-01T00:00:00Z", // Start Date + "ub": "2023-09-27T14:15:05Z" // End Date +} +``` + +In this case, `origin_broker_id` is the ID of message broker from where messages were initially pulled by the Persistor component. + +The final request is thus: + +```bash +curl -XPOST -H "Content-type: application/json" -d '{ + "broker_id": "origin_broker_id", + "lb": "0001-01-01T00:00:00Z", + "ub": "2021-09-27T14:15:05Z" +}' 'http://:8081/range/?topic=' +``` + + +By following this example, if you resubmit all the messages with the given `origin_broker_id` to the specified `destination_topic_id`, you should get a response that looks as follows: + +```json +{ + "status": 200, + "msg": "resubmission successful", + "summary": { + "indexed_count": 20, + "fetched_count": 20, + "deserialized_count": 20, + "published_count": 20 + }, + "errors": {} +} +``` diff --git a/dataphos-docs/content/persistor/quickstart/pulumi.md b/dataphos-docs/content/persistor/quickstart/pulumi.md new file mode 100644 index 0000000..2750097 --- /dev/null +++ b/dataphos-docs/content/persistor/quickstart/pulumi.md @@ -0,0 +1,230 @@ +--- +title: "Pulumi" +draft: false +weight: 3 +--- + +## Setting Up Your Environment + +### Prerequisites + +Persistor components run in a Kubernetes environment. This quickstart guide will assume that you have +[Python 3](https://www.python.org/downloads/) and [Pulumi](https://www.pulumi.com/docs/install/) tools installed. Pulumi repository can be accessed on the [Pulumi repository](https://github.com/dataphos/dataphos-infra). + +This quickstart guide will assume creating new resources instead of importing existing ones into the active stack. If you wish to import your resources check [Deployment Customization](/persistor/configuration/pulumi). + +Persistor has multiple message broker options and storage options. This quickstart guide will assume that the publishing message broker will be either GCP Pub/Sub, Azure ServiceBus, or Kafka, and for storage options Google Cloud Storage(GCS) or Azure Blob Storage. + + +### Persistor namespace + +The namespace where the components will be deployed is defined in the config file, you don't have to create it by yourself. We will use the namespace `dataphos` in this guide. + +```bash + namespace: dataphos +``` + +### Download the Persistor Helm chart + +The Dataphos Helm charts are located in the [Dataphos Helm Repository](https://github.com/dataphos/dataphos-helm). + +To properly reference the Persistor chart, clone the Helm repository and copy the entire `dataphos-persistor` directory into the `helm_charts` directory of this repository. 
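+
+For example, assuming you have `git` available and are working from the root of this repository, the clone-and-copy step might look like this:
+
+```bash
+# Clone the Dataphos Helm repository next to this project and copy the
+# Persistor chart into the local helm_charts directory.
+git clone https://github.com/dataphos/dataphos-helm.git ../dataphos-helm
+cp -r ../dataphos-helm/dataphos-persistor ./helm_charts/
+```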
+ +### Install Dependencies + +Create a virtual environment from the project root directory and activate it: + +```bash +py -m venv venv +./venv/Scripts/activate +``` + +Install package dependencies: +```bash +py -m pip install -r ./requirements.txt +``` + +Note: This usually doesn't take long, but can take up to 45 minutes, depending on your setup. + +## Persistor deployment + +Persistor consists of 4 components: **Persistor Core**, **Indexer**, **Indexer API**, and the **Resubmitter**. + +All four are highly configurable, allowing for a multitude of combinations of brokers and blob storage destinations. In this quickstart, we will outline four of the commonly-used ones. For a complete list and detailed configuration options, we suggest viewing the [Configuration](/persistor/configuration/pulumi) page. + +### Cloud provider and stack configuration + +{{< tabs "persistorplatform" >}} +{{< tab "GCP (Pub/Sub to GCS)" >}} + +### Google PubSub to Google Cloud Storage + +Deploy all of the required Persistor components for consuming messages from a Google PubSub topic and storing them in a Google Cloud Storage account. + +Install the Google Cloud SDK and then authorize access with a user account. Next, Pulumi requires default application credentials to interact with your Google Cloud resources, so run auth application-default login command to obtain those credentials: + +```bash +$ gcloud auth application-default login +``` + +### Configure your stack + +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers. Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a pre-configured stack template for your stack. + +```bash +$ pulumi stack init persistor-gcp-pubsub-dev +``` +This will create a new stack named `persistor-gcp-pubsub-dev` in your project and set it as the active stack. + +{{< /tab >}} + +{{< tab "Azure (Service Bus to Azure Blob Storage)" >}} + +### Azure Service Bus to Azure Blob Storage + +Deploy all of the required Persistor components for consuming messages from a Service Bus topic and storing them into an Azure Blob Storage account. + +Log in to the Azure CLI and Pulumi will automatically use your credentials: +```bash +$ az login +``` + +### Configure your stack +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a pre-configured stack template for your stack. + +```bash +$ pulumi stack init persistor-azure-sb-dev +``` +This will create a new stack named `persistor-azure-sb-dev` in your project and set it as the active stack. 
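+
+For reference, the template-copy step above could look like the following sketch. The file name is an assumption based on the stack name; check the `config_templates` directory for the actual template.
+
+```bash
+# Copy the pre-configured stack template for this architecture into the
+# project root (the file name is a guess; adjust it to the real template).
+cp config_templates/Pulumi.persistor-azure-sb-dev.yaml .
+```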
+
+
+{{< /tab >}}
+{{< tab "Kafka (to GCS)" >}}
+
+### Kafka to Google Cloud Storage
+
+Deploy all of the required Persistor components for consuming messages from a Kafka topic and storing them in a Google Cloud Storage bucket.
+
+Install the Google Cloud SDK and then authorize access with a user account. Next, Pulumi requires default application credentials to interact with your Google Cloud resources, so run the `gcloud auth application-default login` command to obtain those credentials:
+
+```bash
+$ gcloud auth application-default login
+```
+
+### Configure your stack
+You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers. Configuration specifics can be found in the Configuration section of this manual.
+
+To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a pre-configured stack template for your stack.
+
+```bash
+$ pulumi stack init persistor-gcp-kafka-dev
+```
+This will create a new stack named `persistor-gcp-kafka-dev` in your project and set it as the active stack.
+
+{{< /tab >}}
+{{< tab "Kafka (to Azure Blob Storage)" >}}
+
+### Kafka to Azure Blob Storage
+
+Deploy all of the required Persistor components for consuming messages from a Kafka topic and storing them into an Azure Blob Storage account.
+
+Log in to the Azure CLI and Pulumi will automatically use your credentials:
+```bash
+$ az login
+```
+
+### Configure your stack
+You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers. Configuration specifics can be found in the Configuration section of this manual.
+
+To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a pre-configured stack template for your stack.
+
+```bash
+$ pulumi stack init persistor-azure-kafka-dev
+```
+This will create a new stack named `persistor-azure-kafka-dev` in your project and set it as the active stack.
+
+
+{{< /tab >}}
+{{< /tabs >}}
+
+### Deployment
+
+Preview and deploy infrastructure changes:
+```bash
+$ pulumi up
+```
+Destroy your infrastructure changes:
+```bash
+$ pulumi destroy
+```
+
+Following the deployment, the Persistor component will begin automatically pulling data from the configured topic and delivering it to the target storage destination.
+
+By following the quickstart, the destination will be:
+
+```
+{BUCKET/CONTAINER_ID}{TOPIC_ID}/{SUBSCRIPTION_ID or CONSUMER_GROUP_ID}/{YEAR}/{MONTH}/{DAY}/{HOUR}/.*avro
+```
+
+The messages will be stored in batches, in the `.avro` format.
+
+## Resubmitter API
+
+The Resubmitter allows the user to resubmit the stored messages to a destination resubmission topic of their choice. While the Resubmitter allows resubmission based on a number of parameters, in this example, we will resubmit messages based on the **time range** they were ingested onto the platform.
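+
+The examples below target the Resubmitter's Kubernetes service address. A quick way to look up that address is to list the services in the `dataphos` namespace used in this guide and note the Resubmitter's entry; depending on how the service is exposed, you may instead need to reach it via `kubectl port-forward`.
+
+```bash
+# List the services in the deployment namespace and note the address
+# of the Resubmitter service.
+kubectl get svc -n dataphos
+```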
+ + +### Replaying messages based on the ingestion interval + +To resubmit messages using this endpoint, send a **POST** request to the resubmitter service deployed on your Kubernetes cluster: + +```bash +http://:8081/range/indexer_collection?topic= +``` + +With the `` representing the name of the **destination** topic you wish to replay the messages to. Note that, as a best-practice, this should be different from the original topic messages were pulled from, to ensure message resending does not affect all downstream consumers of the original topic unnecessarily. + +The actual request body contains the information from which topic data were initally received, and what time range +the messages were received. + +In this case, JSON attribute *broker_id* was used. + +```json +{ + "broker_id": "origin_broker_id", + "lb": "0001-01-01T00:00:00Z", // Start Date + "ub": "2023-09-27T14:15:05Z" // End Date +} +``` + +In this case, `origin_broker_id` is the ID of message broker from where messages were initially pulled by the Persistor component. + +The final request is thus: + +```bash +curl -XPOST -H "Content-type: application/json" -d '{ + "broker_id": "origin_broker_id", + "lb": "0001-01-01T00:00:00Z", + "ub": "2021-09-27T14:15:05Z" +}' 'http://:8081/range/?topic=' +``` + + +By following this example, if you resubmit all the messages with the given `origin_broker_id` to the specified `destination_topic_id`, you should get a response that looks as follows: + +```json +{ + "status": 200, + "msg": "resubmission successful", + "summary": { + "indexed_count": 20, + "fetched_count": 20, + "deserialized_count": 20, + "published_count": 20 + }, + "errors": {} +} +``` diff --git a/dataphos-docs/content/persistor/quickstart/shell.md b/dataphos-docs/content/persistor/quickstart/shell.md new file mode 100644 index 0000000..ab94aba --- /dev/null +++ b/dataphos-docs/content/persistor/quickstart/shell.md @@ -0,0 +1,324 @@ +--- +title: "Shell" +draft: false +weight: 1 +--- + +## Setting Up Your Environment + +### Prerequisites + +Persistor components run in a Kubernetes environment. This quickstart guide will assume that you have +the ```kubectl``` tool installed and a running Kubernetes cluster on one of the major cloud providers (GCP, Azure) and a +connection with the cluster. The Kubernetes cluster node/nodes should have at least 8 GB of available RAM. + +Persistor has multiple message broker options and storage options. This quickstart guide will assume that the publishing message +broker will be either GCP Pub/Sub, Azure ServiceBus or Kafka, and for storage options Google Cloud Storage(GCS) or Azure Blob Storage. 
+ +{{< tabs "platformconfig" >}} +{{< tab "GCP (Pub/Sub to GCS)" >}} + +## Google PubSub to Google Cloud Storage +- Service account JSON key with the appropriate roles: ([Service Account Creation](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console), [JSON Key Retrieval](https://cloud.google.com/iam/docs/keys-create-delete)) + - Pub/Sub Editor + - Storage Object Admin +- Topic the messages should be persisted from +- The Subscription the Persistor will use to pull the messages from +- Indexer topic and subscription +- Resubmission topic +- GCS bucket +- Optional dead-letter topic (used as a last resort in case of unsolvable issues), with a subscription to retain messages +{{< /tab >}} +{{< tab "Azure (Service Bus to Azure Blob Storage)" >}} +## Azure Service Bus to Azure Blob Storage +- Service principal with roles: + - Azure Service Bus Data Sender + - Azure Service Bus Data Receiver + - Storage Blob Data Contributor + - Don't forget to *save* the `CLIENT_ID`, `CLIENT_SECRET` and `TENANT_ID` values when creating the service principal. +- Service Bus Namespace ([Service Bus Namespace Creation](https://learn.microsoft.com/en-us/azure/service-bus-messaging/service-bus-quickstart-portal#create-a-namespace-in-the-azure-portal)) +- Topic the messages should be persisted from +- The Subscription the Persistor will use to pull the messages from +- Indexer topic and subscription +- Resubmission topic +- Azure Storage account +- Azure blob storage container +- Optional dead-letter topic (used as a last resort in case of unsolvable issues), with a subscription to retain messages +{{< /tab >}} +{{< tab "Kafka (to GCS)" >}} +## Kafka to Google Cloud Storage +- An existing Kafka broker. You can create one yourself in a Kubernetes environment via [Strimzi](https://strimzi.io/docs/operators/0.30.0/quickstart.html), should you choose to do so. +- Service account JSON key with the appropriate roles: ([Service Account Creation](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console), [JSON Key Retrieval](https://cloud.google.com/iam/docs/keys-create-delete)) + - Stackdriver Resource Metadata Writer + - Logs Writer + - Monitoring Metric Writer + - Monitoring Viewer + - Storage Object Admin +- Topic the messages should be persisted from +- Indexer topic +- Resubmission topic +- GCS bucket +- Dead-letter topic (used as a last resort in case of unsolvable issues) +{{< /tab >}} +{{< tab "Kafka (to Azure Blob Storage)" >}} +## Kafka to Azure Blob Storage +- An existing Kafka broker. You can create one yourself in a Kubernetes environment via [Strimzi](https://strimzi.io/docs/operators/0.30.0/quickstart.html), should you choose to do so. +- Service principal with roles: + - Storage Blob Data Contributor + - Don't forget to *save* the `CLIENT_ID`, `CLIENT_SECRET` and `TENANT_ID` values when creating the service principal. +- Topic the messages should be persisted from +- Indexer topic +- Resubmission topic +- Azure Storage account +- Azure blob storage container +- Dead-letter topic (used as a last resort in case of unsolvable issues) +{{< /tab >}} +{{< /tabs >}} + +### Create the Persistor namespace + +Before deploying the Persistor, the namespace where the components will be deployed should be created if it +doesn't exist. + +Create the namespace where Persistor will be deployed. We will use the namespace `dataphos` in this guide. 
+ +```bash +kubectl create namespace dataphos +``` + +## Deployment + +Persistor consists of 4 components: **Persistor Core**, **Indexer**, **Indexer API**, the **Resubmitter**. + +All four are highly configurable, allowing for a multitude of combinations of brokers and blob storage destinations. In this quickstart, we will outline four of the most commonly-used ones. For a complete list and detailed configuration options, we suggest viewing the [Configuration](/persistor/configuration) page. + +### Deploy the Persistor + +{{< tabs "persistorplatform" >}} +{{< tab "GCP (Pub/Sub to GCS)" >}} + +## Google PubSub to Google Cloud Storage + +Deploy all of the required Persistor components for consuming messages from a Google PubSub topic and storing them into a Google Cloud Storage account. + +### Arguments + +The required arguments are: + +- The GCP Project ID +- The name of the topic data will be persisted from +- The Persistor Subscription +- The Bucket data will be persisted to +- The dead letter topic to be used in case of unresolvable errors +- The name of the topic indexation metadata will be sent to +- The Indexer Subscription +- The Path to your locally-stored GCP JSON Service Account Credentials + +The script can be found [here](/referenced-scripts/deployment-scripts/persistor/#persistor-gcp). From the content root, to run the script, run the following command: +```bash +# "myProjectID" is the GCP project ID. +# "persistor-topic" is the Topic messages will be pulled form. +# "persistor-sub" is the subscription the Persistor will use to pull the messages from. +# "persistor-bucket" is the name of the GCS bucket the data will be stored to. +# "persistor-dltopic" is the dead letter topic to be used in case of unresolvable errors +# "indexer-topic" is the topic the Indexer metadata will be sent to. +# "indexer-sub" is the subscription the Indexer component will read the metadata from. +# "C:/Users/User/Dekstop/key.json" is the path to the GCP Service Account key file. + +./persistor_gcp.sh "myProjectID" "persistor-topic" "persistor-sub" "persistor-bucket" "persistor-dltopic" "indexer-topic" "indexer-sub" "C:/Users/User/Dekstop/key.json" +``` + +{{< /tab >}} +{{< tab "Azure (Service Bus to Azure Blob Storage)" >}} + +## Azure Service Bus to Azure Blob Storage + +Deploy all of the required Persistor components for consuming messages from a Service Bus topic and storing them into an Azure Blob Storage account. + +### Arguments + +The required arguments are: + +- The `CLIENT_ID` of the Service Principal +- The `CLIENT_SECRET` of the Service Principal +- The `TENANT_ID` of the Service Principal +- The connection string of the namespace the Persistor's target topic is located in +- The name of the topic data will be persisted from +- The Persistor Subscription +- The Azure Storage Account messages will be persisted to +- The main container the messages will be persisted to +- The dead letter topic to be used in case of unresolvable errors +- The connection string of the namespace the Indexer topic is located in +- The name of the topic indexation metadata will be sent to +- The Indexer Subscription + +The script can be found [here](/referenced-scripts/deployment-scripts/persistor/#persistor-azure). From the content root, to run the script, run the following command: + + + +```bash +# "19b725a4-1a39-5fa6-bdd0-7fe992bcf33c" is an Azure CLIENT_ID. +# "38c345b5-1b40-7fb6-acc0-5ab776daf44e" is an Azure CLIENT_SECRET. +# "49d537a6-8a49-5ac7-ffe1-6fe225abe33f" is an Azure TENANT_ID. 
+# "namespace-conn-per" is the connection string of the Service Bus namespace to persist from. The actual value should be something of the form "Endpoint=sb://per-namespace.servicebus.windows.net/;SharedAccessKeyName=RootManageSharedAccessKey;SharedAccessKey=..." +# "persistor-topic" is the name of the Service Bus topic to persist from. +# "persistor-sub" is the subscription the Persistor will use to pull the messages from. +# "myaccountstorage" is the name of the Azure Storage Account data will be saved to. +# "persistor-container" is the name of the container data will be saved to. +# "persistor-dltopic" is the dead letter topic to be used in case of unresolvable errors +# "namespace-conn-idx" is the connection string of the Service Bus namespace Indexer metadata will be sent to. The actual value should be something of the form "Endpoint=sb://idx-namespace.servicebus.windows.net/;SharedAccessKeyName=RootManageSharedAccessKey;SharedAccessKey=..." +# "indexer-topic" is the topic the Indexer metadata will be sent to. +# "indexer-sub" is the subscription the Indexer component will read the metadata from. + +./persistor_azure.sh "19b725a4-1a39-5fa6-bdd0-7fe992bcf33c" "38c345b5-1b40-7fb6-acc0-5ab776daf44e" "49d537a6-8a49-5ac7-ffe1-6fe225abe33f" "namespace-conn-per" "persistor-topic" "persistor-sub" "myaccountstorage" "persistor-container" "persistor-dltopic" "namespace-conn-idx" "indexer-topic" "indexer-sub" +``` + +{{< /tab >}} +{{< tab "Kafka (to GCS)" >}} + +## Kafka to Google Cloud Storage + +Deploy all of the required Persistor components for consuming messages from a Kafka topic and storing them into a Google Cloud Storage bucket. + +### Arguments + +The required arguments are: + +- The GCP Project ID of the GCS bucket the data will be persisted to +- The address of the Kafka broker the topic data will be persisted from is located (the host if the broker is publicly-exposed, alternatively the Kubernetes DNS name) +- The name of the topic data will be persisted from +- The name of the consumer group the Persistor will use +- The Bucket data will be persisted to +- The dead letter topic to be used in case of unresolvable errors +- The address of the Kafka broker the indexing metadata topic is located in +- The name of the topic indexation metadata will be sent to +- The name of the consumer group the Indexer will use +- The Path to your locally-stored GCP JSON Service Account Credentials + +The script can be found [here](/referenced-scripts/deployment-scripts/persistor/#persistor-kafka-to-gcs). From the content root, to run the script, run the following command: +```bash +# "myProjectID" is the GCP project ID the storage account is located in. +# "[10.20.0.10]" is (one of) the IPs to the Kafka Bootstrap server of the cluster we are persisting from. +# "persistor-topic" is the Topic messages will be pulled form. +# "Persistor" is example of consumer group the Persistor will use. +# "persistor-bucket" is the name of the GCS bucket the data will be stored to. +# "persistor-dltopic" is the dead letter topic to be used in case of unresolvable errors. +# "["10.20.0.10"] is (one of) the IPs to the Kafka Bootstrap server of the cluster the Indexer is located in. +# "indexer-topic" is the topic the Indexer metadata will be sent to. +# "Indexer" is example of consumer group for indexer. +# "C:/Users/User/Dekstop/key.json" is the path to the GCP Service Account key file. 
+
+./persistor_kafka_gcs.sh "myProjectID" "[10.20.0.10]" "persistor-topic" "Persistor" "persistor-bucket" "persistor-dltopic" "[10.20.0.10]" "indexer-topic" "Indexer" "C:/Users/User/Dekstop/key.json"
+```
+
+{{< /tab >}}
+{{< tab "Kafka (to Azure Blob Storage)" >}}
+
+## Kafka to Azure Blob Storage
+
+Deploy all of the required Persistor components for consuming messages from a Kafka topic and storing them into an Azure Blob Storage account.
+
+### Arguments
+
+The required arguments are:
+
+- The `CLIENT_ID` of the Service Principal
+- The `CLIENT_SECRET` of the Service Principal
+- The `TENANT_ID` of the Service Principal
+- The address of the Kafka broker on which the topic to persist from is located (the host if the broker is publicly exposed, alternatively the Kubernetes DNS name)
+- The name of the topic data will be persisted from
+- The name of the consumer group the Persistor will use
+- The Azure Storage Account messages will be persisted to
+- The main container the messages will be persisted to
+- The dead letter topic to be used in case of unresolvable errors
+- The address of the Kafka broker on which the indexing metadata topic is located
+- The name of the topic indexation metadata will be sent to
+- The name of the consumer group the Indexer will use
+
+The script can be found [here](/referenced-scripts/deployment-scripts/persistor/#persistor-kafka-to-azure-blog-storage). To run the script from the content root, run the following command:
+```bash
+# "19b725a4-1a39-5fa6-bdd0-7fe992bcf33c" is an Azure CLIENT_ID.
+# "38c345b5-1b40-7fb6-acc0-5ab776daf44e" is an Azure CLIENT_SECRET.
+# "49d537a6-8a49-5ac7-ffe1-6fe225abe33f" is an Azure TENANT_ID.
+# "[10.20.0.10]" is (one of) the IPs of the Kafka Bootstrap server of the cluster we are persisting from.
+# "persistor-topic" is the topic messages will be pulled from.
+# "Persistor" is an example of the consumer group the Persistor will use.
+# "myaccountstorage" is the name of the Azure Storage Account data will be saved to.
+# "persistor-container" is the name of the container data will be saved to.
+# "persistor-dltopic" is the dead letter topic to be used in case of unresolvable errors.
+# "[10.20.0.10]" is (one of) the IPs of the Kafka Bootstrap server of the cluster the Indexer is located in.
+# "indexer-topic" is the topic the Indexer metadata will be sent to.
+# "Indexer" is an example of the consumer group the Indexer will use.
+
+./persistor_kafka_az_blob.sh "19b725a4-1a39-5fa6-bdd0-7fe992bcf33c" "38c345b5-1b40-7fb6-acc0-5ab776daf44e" "49d537a6-8a49-5ac7-ffe1-6fe225abe33f" "[10.20.0.10]" "persistor-topic" "Persistor" "myaccountstorage" "persistor-container" "persistor-dltopic" "[10.20.0.10]" "indexer-topic" "Indexer"
+```
+
+{{< /tab >}}
+{{< /tabs >}}
+
+Following the deployment, the Persistor component will automatically begin pulling data from the configured topic and delivering it to the target storage destination.
+
+By following the quickstart, the destination will be:
+
+```
+{BUCKET/CONTAINER_ID}/{TOPIC_ID}/{SUBSCRIPTION_ID or CONSUMER_GROUP_ID}/{YEAR}/{MONTH}/{DAY}/{HOUR}/*.avro
+```
+
+The messages will be stored in batches, in the `.avro` format.
+
+## Resubmitter API
+
+The Resubmitter allows the user to resubmit the stored messages to a destination resubmission topic of their choice. While the Resubmitter allows resubmission based on a number of parameters, in this example we will resubmit messages based on the **time range** in which they were ingested onto the platform.
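+
+Before triggering a resubmission, it can be useful to confirm that the Persistor has actually written batches to the destination described above. A minimal check for the GCP quickstart, assuming the `persistor-bucket` bucket used earlier and a locally authenticated `gcloud`/`gsutil` environment:
+
+```bash
+# Recursively list the bucket to inspect the generated
+# topic/subscription/date folder structure and the .avro batches.
+gsutil ls -r gs://persistor-bucket/
+```
+
+For the Azure Blob Storage variants, an equivalent check can be done with `az storage blob list` against the configured container.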
+
+
+### Replaying messages based on the ingestion interval
+
+To resubmit messages using this endpoint, send a **POST** request to the resubmitter service deployed on your Kubernetes cluster:
+
+```bash
+http://:8081/range/indexer_collection?topic=
+```
+
+Here, `` represents the name of the **destination** topic you wish to replay the messages to. Note that, as a best practice, this should be different from the original topic the messages were pulled from, to ensure that resending messages does not unnecessarily affect all downstream consumers of the original topic.
+
+The request body specifies the topic from which the data was initially received and the time range in which the messages were received.
+
+In this case, the JSON attribute *broker_id* is used.
+
+```json
+{
+    "broker_id": "origin_broker_id",
+    "lb": "0001-01-01T00:00:00Z", // Start Date
+    "ub": "2021-09-27T14:15:05Z"  // End Date
+}
+```
+
+Here, `origin_broker_id` is the ID of the message broker from which the messages were initially pulled by the Persistor component.
+
+The final request is thus:
+
+```bash
+curl -XPOST -H "Content-type: application/json" -d '{
+    "broker_id": "origin_broker_id",
+    "lb": "0001-01-01T00:00:00Z",
+    "ub": "2021-09-27T14:15:05Z"
+}' 'http://:8081/range/?topic='
+```
+
+
+By following this example, if you resubmit all the messages with the given `origin_broker_id` to the specified `destination_topic_id`, you should get a response that looks as follows:
+
+```json
+{
+    "status": 200,
+    "msg": "resubmission successful",
+    "summary": {
+        "indexed_count": 20,
+        "fetched_count": 20,
+        "deserialized_count": 20,
+        "published_count": 20
+    },
+    "errors": {}
+}
+```
\ No newline at end of file
diff --git a/dataphos-docs/content/persistor/usage.md b/dataphos-docs/content/persistor/usage.md
new file mode 100644
index 0000000..18e4ce5
--- /dev/null
+++ b/dataphos-docs/content/persistor/usage.md
@@ -0,0 +1,211 @@
+---
+title: "Usage"
+draft: false
+weight: 2
+---
+
+This page outlines the general usage of the component -- primarily referring to its REST APIs utilized for indexation and resubmission.
+
+# Persistor Core
+
+Once the Persistor is successfully deployed, it is ready to ingest data. The ingestion is started by publishing messages to the Persistor's topic. That can be done by using the [Publisher](/publisher), or by creating your own publisher for testing purposes.
+
+You can check the results by inspecting the files saved to Google Cloud Storage or Azure Blob Storage at any point during the publishing process (assuming you have deployed the Persistor in the `Push` or `Constant Pull` model).
+
+If you have also deployed the Indexer component, you will find the metadata populated in the Persistor's underlying Indexer database, as well.
+
+Both Persistor and Indexer expose their metrics on the `:2112/metrics` endpoint. There you can view information about the program runtime, memory and CPU usage, and amounts of processed data.
+
+# Resubmitter REST API
+
+The Persistor offers an interactive component - the **Resubmitter API**. It reconstructs the original message and re-sends it to the broker, in the same way the original message producer would.
+
+To do this, the Resubmitter connects to the Indexer API to fetch the metadata necessary for resubmission, while also connecting to permanent storage to retrieve payloads and to a messaging service to publish the reconstructed messages.
+The rights to connect to the storage and the messaging service are provided by the service account used, **but there are possible issues when trying to resubmit messages to a Kafka broker that uses access control lists. It is possible that Kafka refuses the Resubmitter's connection to an existing topic and stalls the workflow without returning any errors. If the resubmission is done to a topic that doesn't exist, the Resubmitter will create it and successfully publish the messages.**
+
+The Resubmitter API offers multiple methods of resubmitting messages. The core API can be found at:
+
+```bash
+http://:8081//?topic=
+```
+
+where:
+
+- `````` is the resubmission method being used
+- `````` is the DNS hostname (or IP address) of the Resubmitter API
+- `````` is the name of the MongoDB collection in which the message metadata will be searched for
+- `````` is the message broker topic where the Resubmitter will publish messages
+
+The API answers every request with a response containing a status code and message, as well as a summary of how many messages passed through each stage - indexing from the Mongo database, fetching from storage, deserializing into records if necessary, and publishing to the message broker.
+
+Status 500 means an internal error prevented the Resubmitter from processing the request.
+
+Status 400 means the Resubmitter API successfully started the resubmission pipeline, but some of the message information was wrong and prevented any messages from being published.
+
+If an error happened during resubmission, but only for some of the messages, a list of errors and the messages they happened on is also returned as part of the response, and the response will have Status 204 Partial resubmission.
+
+{{< tabs "Resubmitter REST API" >}}
+{{< tab "Resubmit by ID" >}}
+## Resubmit by ID
+
+This request resubmits the messages whose unique ID is among the given IDs. The unique ID consists of the broker_id and message_id of the stored message in the data lake: `unique_id = {broker_id}_{message_id}`.
+To resubmit messages by their exact IDs, a `POST` request to ```http://:8081/resubmit/?topic=``` must be sent with the request body containing an array of IDs of the messages to be resubmitted.
+
+```json
+{
+    "ids": ["msg-topic_2523966771678879", "msg-topic_2523966771678312"]
+}
+```
+
+This can also be done using curl:
+
+```bash
+curl -XPOST -H "Content-type: application/json" -d '{
+    "ids": ["msg-topic_2523966771678879", "msg-topic_2523966771678312"]
+}' 'http://:8081/resubmit/?topic='
+```
+
+An example of a successful response is:
+
+```json
+{
+    "status": 200,
+    "msg": "resubmission successful",
+    "summary": {
+        "indexed_count": 2,
+        "fetched_count": 2,
+        "deserialized_count": 2,
+        "published_count": 2
+    }
+}
+```
+
+If the IDs given in the body don't have corresponding messages in MongoDB, Status 200 OK without errors is returned, but the counters show that the messages were not found in the database and were not published.
+
+{{< /tab >}}
+{{< tab "Resubmit by Time Interval" >}}
+
+## Resubmit by Time Interval
+
+The range request is used to resubmit all messages from a given broker_id within the given time interval/range.
+
+To resubmit all messages from a given topic within the given time interval, a POST request to ```http://:8081/range/?topic=``` must be sent with the request body containing the `broker_id`, whose value is the topic ID from which the messages were initially ingested.
+ +Optionally, two other body request parameters can be defined: +- ```lb``` is the lower bound of the time interval (the one further back in time) +- ```ub``` is the upper bound of the time interval (the more recent time) + +If an interval bound parameter is omitted, a default value is used instead. For the lower bound, the default value is 0001/01/01 00:00:00.000000000 UTC, while for the upper bound it is the moment the request had been made. + +An example `/range` request: + +```json +{ + "broker_id": "per-test-topic", + "lb": "0001-01-01T00:00:00Z", + "ub": "2021-09-27T14:15:05Z" +} +``` + +This can also be done using curl: + +```bash +curl -XPOST -H "Content-type: application/json" -d '{ + "broker_id": "per-test-topic", + "lb": "0001-01-01T00:00:00Z", + "ub": "2021-09-27T14:15:05Z" +}' 'http://:8081/range/?topic=' +``` + +{{}} +{{< tab "Resubmit by Custom Query Filter" >}} + +## Resubmit by Custom Query Filter + +The query request is used to resubmit all messages satisfying the custom query filter, i.e. containing exact Persistor-generated metadata values given in the request body. +To use this method, a POST request to ```http://:8081/query/?topic=``` must be sent with the request body containing the filters used. + +The query filters are defined as an array of JSON objects, where each filter is represented by one object and it filters data that satisfies all of its fields. Then, all the filters are combined using the OR logical operation, meaning any data that satisfies at least one full filter is resubmitted. + +JSON fields for a filter should be metadata field names, while their value should either be the exact value to be queried for or a JSON object containing more advanced information for querying. + +Special keywords that can be used in JSONs: + +| Operator | Description | +|:---------|:--------------------------------------------------------------------| +| $eq | Matches values that are equal to a specified value. | +| $gt | Matches values that are greater than a specified value. | +| $gte | Matches values that are greater than or equal to a specified value. | +| $in | Matches any of the values specified in an array. | +| $lt | Matches values that are less than a specified value. | +| $lte | Matches values that are less than or equal to a specified value. | +| $ne | Matches all values that are not equal to a specified value. | +| $nin | Matches none of the values specified in an array. | + + +Example of the request body containing filtering information: + +```json +{ + "filters" : [ + { + "additional_metadata.a" : "x", + "location_position" : { + "$gte": 88, "$lt": 91 + }, + "publish_time": { + "$gte": "2023-02-05 19:20:55.342" + } + }, + { + "additional_metadata.b" : "y" + } + ] +} +``` + +which, by using curl, can be sent as: + +```bash +curl -XPOST -H "Content-type: application/json" -d '{ + "filters" : [ + { + "additional_metadata.a" : "x", + "location_position" : { + "$gte": 88, "$lt": 91 + }, + "publish_time": { + "$gte": "2023-02-05 19:20:55.342" + } + }, + { + "additional_metadata.b" : "y" + } + ] +}' 'http://:8081/query/?topic=' +``` + +Let’s say there are 80 messages with ```"additional_metadata.a" : "x"``` (meaning the message’s additional metadata has to have the field "a", and its value has to be "x"), but only 10 of them have "location_position" in [88, 91), and only 5 of those have "publish_time" after "2023-02-05 19:20:55.342" - those 5 messages would be the result for the first query filter. 
+ +In addition, if there are 100 messages with ```"additional_metadata.b" : "y"```, all of those messages will also be resubmitted. If there is an intersection between those two filter results, those messages are resubmitted only once. + +It is important to note that if timestamps must be given in the format used in the example - `yyyy-MM-dd HH:mm:ss`. + +Furthermore, additional metadata is best defined using single fields and values for those fields, but it can also be defined as a JSON, IE. + +```json +{ + "additional_metadata" : { + "b": "y", + "a": "x" + } +} +``` + +but in that case, all fields must have correct values and their order must be the same as in the query. + +If a query filter key is not a valid Persistor metadata field, an error containing the invalid keys will be returned. + +{{}} +{{}} diff --git a/dataphos-docs/content/persistor/videos-and-blogs.md b/dataphos-docs/content/persistor/videos-and-blogs.md new file mode 100644 index 0000000..be3805f --- /dev/null +++ b/dataphos-docs/content/persistor/videos-and-blogs.md @@ -0,0 +1,18 @@ +--- +title: "Videos and Blogs" +draft: false +weight: 5 +--- +## Blogs +[Persistor business blog](https://www.syntio.net/en/labs-musings/data-orchestration-with-dataphos-persistor-technical-overview?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) + +[Persistor technical blog](https://www.syntio.net/en/labs-musings/persistor-a-game-changer-in-data-persistence-and-management?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) + + +Persistor Overview and Demo + +[![Persistor - A Data Platform component by Syntio - Showcase](/persistor_thumbnail_overview.jpg)](https://youtu.be/pB1EXYE88zE "Persistor - A Data Platform component by Syntio - Showcase") + +Persistor Deployment Guide + +[![Persistor - A Data Platform component by Syntio - Deployment Guide](/persistor_thumbnail_deployment.jpg)](https://youtu.be/g83_ZG01q5c "Persistor - A Data Platform component by Syntio - Deployment Guide") diff --git a/dataphos-docs/content/persistor/what-is-the-persistor.md b/dataphos-docs/content/persistor/what-is-the-persistor.md new file mode 100644 index 0000000..be34416 --- /dev/null +++ b/dataphos-docs/content/persistor/what-is-the-persistor.md @@ -0,0 +1,52 @@ +--- +title: "Overview" +draft: false +weight: 1 +--- + +![](/persistor.png) + +**Persistor** is a stateless component that efficiently stores messages collected from a message broker topic onto a well-structured data lake. It automatically performs indexation of messages, allowing for their later retrieval and resubmission. + +It is designed to eliminate the need for re-submission requests from the original publishers (which is often impossible) and to accelerate the building of well-established data pipelines in general -- utilizing the structured storage as a new source of data in itself. + +The data can be stored in batches (in **Avro** format) or as individual messages. + +## Persistor Components + +The Persistor is built as a set of interconnected components. A single Persistor deployment consists of multiple Persistor instances (one per topic), with the indexation and resubmission engines being shared, depending on the use case and planned access control. + +### Persistor Core + +The main component. Subscribes to a topic and sends messages to persistent storage. It is the only non-optional component. + +Supports receiving messages from Google Pub-Sub, Azure Service Bus or Apache Kafka. 
The available storage options are GCS and ABS + +The codebase is modular and can be easily extended to include multiple storage and broker options. Users generally don't need to interact with it once it is running. + +Enables folder structure based on the publish time of a message (coarse or fine-grained, down to the hour). For instance, we would generally expect to find a structure similar to: + +``` +{BUCKET/CONTAINER ID}/{SUBSCRIPTION_ID}/{YEAR}/{MONTH}/{DAY}/{HOUR}/{blob_name}.avro +``` + +The folder structure also supports the ability of utilizing arbitrary metadata. For instance, if your message features a `schemaID` field in its metadata, you would be able to categorize your messages based on it, making the structure look as follows: + +``` +{BUCKET/CONTAINER ID}/{SUBSCRIPTION_ID}/{SCHEMA_ID}/{YEAR}/{MONTH}/{DAY}/{HOUR}/{blob_name}.avro +``` + +(The positioning of this arbitrary metadata can also be configured. Please see the [Configuration](/persistor/configuration) section for more details.) + +If the **Indexer** component is deployed and enabled, Persistor will formulate a metadata object for each message, containing the information on where each of the received messages is located and their original metadata (headers). + +### Indexer + +The Indexer component communicates via the original Persistor component via another message broker topic. The Indexer is responsible for consuming the received message metadata and storing it in a NoSQL database (in this specific case, Mongo), to be utilized for finding messages during the exploration and resubmission periods. + +The data is exposed via a simple REST **API**. + +### Resubmitter + +The Resubmitter component is built on top of the Indexer component, allowing the user to query the stored metadata to find specific messages (or messages published/received within a specific time period) and trigger their resubmission to a new topic for re-processing. + diff --git a/dataphos-docs/content/publisher/_index.md b/dataphos-docs/content/publisher/_index.md new file mode 100644 index 0000000..ad8844d --- /dev/null +++ b/dataphos-docs/content/publisher/_index.md @@ -0,0 +1,11 @@ +--- +title: "Publisher" +draft: true +weight: 1 +--- + +The Publisher is developed for running a constant flow of ready to digest data packages to your cloud from a database or from an exposed API. + + + +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/publisher/configuration/_index.md b/dataphos-docs/content/publisher/configuration/_index.md new file mode 100644 index 0000000..8a5155a --- /dev/null +++ b/dataphos-docs/content/publisher/configuration/_index.md @@ -0,0 +1,131 @@ +--- +title: "Deployment Customization" +draft: false +weight: 4 +geekdocCollapseSection: true +--- + + +This page describes Publisher architecture and the deployment using YAML files. Whereas the [Quickstart](/publisher/quickstart) will get you started fairly quickly, this page will explain more precisely the individual components being deployed, how to manually deploy the components yourself and how to configure a DNS certificate for the publicly-exposed components. The following pages go into further detail on how to customize your Kubernetes deployments: +{{< toc-tree >}} + +# Publisher Architecture + +The following diagram gives an overview of Publisher’s deployment process and the end result of said process. 
+ +![Scenario 1: Across columns](/arch.png) + +When deploying Publisher, you deploy the following components: + +* A **Postgres Metadata Database** that will be used to store the configuration information on the individual Publisher runs, known sources and destinations. +* A **database initialization container**. +* The **Manager**, a simple REST web server connected to the Metadata Database. +* The **Scheduler**. It communicates with the Manager. +* The **Java Data Fetcher** used to connect to JDBC-supported database sources, used for performance purposes. It communicates with the sources and the Manager. +* The **Avro Schema Serializer** component, which dynamically infers and defines the Avro Schema based on the user definition provided. +* The **Web UI**. + +## Publisher Deployment on Any Kubernetes {#reference_anykubernetes} + + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). To handle HTTPS, two external components must be deployed to the cluster. One to enable external traffic to the cluster, and the other to ensure the external traffic uses HTTPS protocol. Installation is done using public images. + +## Helm + +The deployment of required prerequisites is done over Helm. Helm is the package manager for Kubernetes. Helm can be installed using Chocolatey (Windows) and Apt (Debian/Ubuntu). For other operating systems check the official documentation. + +On Windows: + +```yaml +choco install kubernetes-helm +``` + +On Debian/Ubuntu: + +```bash +curl https://baltocdn.com/helm/signing.asc | gpg --dearmor | sudo tee /usr/share/keyrings/helm.gpg > /dev/null +sudo apt-get install apt-transport-https --yes +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list +sudo apt-get update +sudo apt-get install helm +``` + +## Nginx Ingress Controller + +To manage external traffic to the cluster, an Ingress Controller solution should be provided. We opted for the Nginx Ingress Controller. + +Install the Nginx Ingress Controller: + +```bash +helm install ingress-nginx ingress-nginx/ingress-nginx --create-namespace --namespace ingress-basic --set controller.service.annotations."service\.beta\.kubernetes\.io/azure-load-balancer-health-probe-request-path"=/healthz +``` + +Generate TLS key and TLS certificate which will be used in secrets (you can use the use v3.conf file from [here](/referenced-scripts/YAML-examples/publisher/#v3-config)). + +```bash +openssl req -newkey rsa:2048 -nodes -keyout tls.key -out tls.csr -config v3.conf + +openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365 -extensions v3_req -extfile v3.conf +``` + + +## DNS records + +To use the Web UI and Manager components, DNS records need to be created for Ingress Controllers public IP address. + +Extract the Ingress Controller public IP address. + +```bash +kubectl get services nginx-ingress-ingress-nginx-controller --namespace ingress-basic \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` +On your domain provider, add two A record sets for the extracted IP address, one for the Manager component and one for the Web UI component. + +In case your organization does not own a registered domain, we recommend GoDaddy as the domain provider. + +## Authentication and encryption + +Select the Postgres database credentials (username and password) you wish to use. 
The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). + + +## Deployment + +The YAML files needed for the deployment can be found on: + +* [Publisher Components](/referenced-scripts/YAML-examples/publisher/#publisher-k8s) +* [Secret Files](/referenced-scripts/YAML-examples/publisher/#publisher-secrets) +* [Ingress](/referenced-scripts/YAML-examples/publisher/#publisher-ingress) + +Using the CLI, apply the Publisher deployment files while positioned in the directory with deployment files: + +Apply *secrets.yaml* file: +```bash +kubectl apply -f secrets.yaml +``` +Apply *publisher.yaml* file: + +```bash +kubectl apply -f publisher.yaml +``` +Wait for the pods to become healthy. +Apply *ingress.yaml* file: + +```bash +kubectl apply -f ingress.yaml +``` + + +## Schema Registry Integration + +In order to integrate Publisher with the [Dataphos Schema Registry](/schema_registry), you need to pass the IP address or service name to the variable `SCHEMA_VALIDATION_URL`. If the Schema Registry is in the same cluster, pass the internal DNS name of the service `http://..svc.cluster.local`. After [Dataphos Schema Registry](/schema_registry) has been deployed, you can obtain the Dataphos Schema Registry service information by running: + +```bash +kubectl -n dataphos get svc schema-registry-svc +``` \ No newline at end of file diff --git a/dataphos-docs/content/publisher/configuration/helm.md b/dataphos-docs/content/publisher/configuration/helm.md new file mode 100644 index 0000000..7fcc2fa --- /dev/null +++ b/dataphos-docs/content/publisher/configuration/helm.md @@ -0,0 +1,43 @@ +--- +title: "Helm" +draft: false +weight: 2 +--- + + + +# Configuration in the dataphos-publisher chart {#reference_publisher} + +Below is the list of configurable options in the `values.yaml` file. + +| Variable | Type | Description | Default | +|----------------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------| +| namespace | string | The namespace to deploy the Publisher into. | `dataphos` | +| images | object | Docker images to use for each of the individual Publisher sub-components. | | +| images.initdb | string | Initdb Docker image. | `syntioinc/dataphos-publisher-initdb:1.0.0` | +| images.avroSchemaGenerator | string | Avro schema generator image. | `syntioinc/dataphos-publisher-avro-schema-generator:1.0.0` | +| images.scheduler | string | Scheduler image. | `syntioinc/dataphos-publisher-scheduler:1.0.0` | +| images.manager | string | Manager image. | `syntioinc/dataphos-publisher-manager:1.0.0` | +| images.fetcher | string | Fetcher image. | `syntioinc/dataphos-publisher-data-fetcher:1.0.0` | +| images.worker | string | Worker image. | `syntioinc/dataphos-publisher-worker:1.0.0` | +| testMode | boolean | Whether some internal services should be exposed if the Publisher is being deployed as part of debugging/testing (for example, the metadata database). | `false` | +| encryptionKeys | string | A multi-line string written in a key-value pair way. 
Each pair is a separate key that could be used by the Publisher instance when encrypting data. | `ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A"` | +| manager | object | The Manager configuration object. | | +| manager.metadataUser | string | Metadata database username. | `someuser` | +| manager.metadataPassword | string | Metadata database password. | `somepassword` | +| webui | object | Web UI configuration object. | | +| webui.endpoint | string | Web UI endpoint (used by the Manager when performing CORS validation) | `http://localhost:9999` | +| schemaRegistry | object | Schema Registry configuration object. | | +| schemaRegistry.url | string | The URL to the Schema Registry (if present) | `http://schema-registry-svc..svc.cluster.local:8080` | + +# Configuration in the dataphos-publisher-webui chart {#reference_publisher_webui} + +Below is the list of configurable options in the `values.yaml` file. + +| Variable | Type | Description | Default | +|--------------|--------|---------------------------------------------------------------------------|---------------------------------------------| +| namespace | string | The namespace to deploy the Publisher into. | `dataphos` | +| images | object | Docker images to use for each of the individual Publisher sub-components. | | +| images.webui | string | Web UI image. | `syntioinc/dataphos-publisher-webui:1.0.0` | +| webui | object | Web UI configuration object. | | + diff --git a/dataphos-docs/content/publisher/configuration/pulumi.md b/dataphos-docs/content/publisher/configuration/pulumi.md new file mode 100644 index 0000000..af976b6 --- /dev/null +++ b/dataphos-docs/content/publisher/configuration/pulumi.md @@ -0,0 +1,192 @@ +--- +title: "Pulumi" +draft: false +weight: 3 +--- +# Configuration + +There are three possible sources of resource configuration values: user configuration in the active stack configuration file, retrieved data from existing resources, and default system-level configuration from the application code. + +User configuration will always take precedence over other configuration sources. If there is no special user configuration for a parameter, the retrieved value from the resource’s previous configuration will be used. If there wasn’t any data retrieved for the resource (as it is being created for the first time), the default system-level configuration value will be used instead. The default values for parameters are listed in the appropriate section of the configuration options. + +If the configuration references an existing cloud resource, the program will retrieve its data from the cloud provider and import the resource into the active stack instead of creating a new one. If the user configuration values specify any additional parameters that differ from the resource configuration while it has not yet been imported into the stack, the deployment will fail. To modify an existing resource’s configuration, import it into the stack first and then redeploy the infrastructure with the desired changes. + +Note: Implicit import of an AKS cluster is currently not supported. To use an existing AKS cluster in your infrastructure, set the AKS cluster's import configuration option to true. + +⚠️ WARNING ⚠️ + +Imported resources will NOT be retained by default when the infrastructure is destroyed. If you want to retain a resource when the infrastructure is destroyed, you need to explicitly set its retain flag to true in the active stack's configuration file. 
Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state on a pulumi destroy. Azure resource groups and GCP projects are set to be retained by default and can be deleted manually. Be careful if you choose not to retain them, as destroying them will remove ALL children resources, even the ones created externally. It is recommended to modify these options only if you are using a dedicated empty project/resource group. + + +## Global Configuration + +Below is the shared configuration used between all Persistor types. + +| Variable | Type | Description | Default value | +|--------------------------|---------|----------------------------------------------------------------------------------------------------------------|---------------| +| `namespace` | string | The name of the Kubernetes namespace where Dataphos Helm charts will be deployed to. | `dataphos` | +| `deployPublisher` | boolean | Whether the Publisher and Publisher Web UI Helm charts should be deployed. | `false` | +| `retainResourceGroups` | boolean | Whether Azure resource groups should be retained when the infrastructure is destroyed. | `true` | +| `retainProjects` | boolean | Whether GCP projects should be retained when the infrastructure is destroyed. | `true` | +| `resourceTags` | object | Set of `key:value` tags attached to all Azure resource groups; or set of labels attached to all GCP resources. | | + +## Product Configuration + +The `namespace` and `images` options at the top-level of the Helm chart configurations are set by default and do not need to be manually configured. + +Cloud-specific variables should not be manually configured. Depending on the configured cloud provider, service accounts with appropriate roles are automatically created and their credentials are used to populate these variables. + +| Variable | Type | Description | +|---------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `dataphos-publisher` | object | Dataphos Publisher Helm chart configuration. Configuration options are listed in the [publisher configuration]({{< ref "helm#reference_publisher">}}). | +| `dataphos-publisher-webui` | object | Dataphos Publisher Web UI Helm chart configuration. Configuration options are listed in the [publisher webui configuration]({{< ref "helm#reference_publisher_webui">}}). | + + +## Provider configuration options +The variables listed here are required configuration options by their respective Pulumi providers. Your entire infrastructure should reside on a single cloud platform. Deployment across multiple cloud platforms is currently not fully supported. + +{{< tabs "Provider configuration options" >}} + +{{< tab "Azure" >}} +| Variable | Type | Description | Example value | +|-------------------------|--------|------------------------------------|---------------| +| `azure-native:location` | string | The default resource geo-location. | `westeurope` | + +A list of all configuration options for this provider can be found here: +[Azure Native configuration options](https://www.pulumi.com/registry/packages/azure-native/installation-configuration/#configuration-options). 
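+
+These provider options are set per stack with the Pulumi CLI and are written to the active stack's configuration file. A minimal sketch, assuming a stack named `dev` has already been initialized (the stack name is illustrative):
+
+```bash
+# Select the stack whose configuration file should receive the values.
+pulumi stack select dev
+
+# Set the default geo-location used by the azure-native provider.
+pulumi config set azure-native:location westeurope
+```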
+ +{{}} + + +{{< tab "GCP" >}} +To successfully deploy resources in a GCP project, the appropriate APIs need to be enabled for that project in the API Console. See: [Enable and disable APIs](https://support.google.com/googleapi/answer/6158841). + +| Variable | Type | Description | Example value | +|---------------|--------|--------------------------|-------------------| +| `gcp:project` | string | The default GCP project. | `syntio-dataphos` | +| `gcp:region` | string | The default region.. | `europe-west2` | +| `gcp:zone` | string | The default zone. | `europe-west2-a` | + +A list of all configuration options for this provider can be found here: +[GCP configuration options](https://www.pulumi.com/registry/packages/gcp/installation-configuration/#configuration-reference). + +{{}} +{{}} + +## Cluster configuration options + +The stack configuration `cluster` object is utilized to configure the Kubernetes cluster necessary to deploy the Helm charts that comprise Dataphos products. + +### Common cluster configuration + +| Variable | Type | Description | +|-----------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cluster` | object | The object containing the general information on the cluster. | +| `cluster.CLUSTER_ID` | object | The object representing an individual cluster's configuration. | +| `cluster.CLUSTER_ID.type` | string | The type of the managed cluster. Valid values: [`gke`, `aks`]. | +| `cluster.CLUSTER_ID.name` | string | The name of the managed cluster. | +| `cluster.CLUSTER_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +### Specific cluster configuration + +{{< tabs "Cluster configuration options" >}} + +{{< tab "AKS" >}} +| Variable | Type | Description | Default value | +|----------------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------| +| `cluster.CLUSTER_ID.import` | boolean | Whether to use an existing AKS cluster instead of creating a new one.
**Note:** AKS clusters imported in this way will be retained on destroy, unless its resource group is not retained. | `false` | +| `cluster.CLUSTER_ID.resourceGroup` | string | The name of the resource group. The name is case insensitive. | | +| `cluster.CLUSTER_ID.sku` | object | The managed cluster SKU. | | +| `cluster.CLUSTER_ID.sku.name` | string | The managed cluster SKU name. | `Basic` | +| `cluster.CLUSTER_ID.sku.tier` | string | The managed cluster SKU tier. | `Free` | +| `cluster.CLUSTER_ID.dnsPrefix` | string | The cluster DNS prefix. This cannot be updated once the Managed Cluster has been created. | | +| `cluster.CLUSTER_ID.agentPoolProfiles` | object | The agent pool properties. | | +| `cluster.CLUSTER_ID.agentPoolProfiles.name` | string | Windows agent pool names must be 6 characters or less. | | +| `cluster.CLUSTER_ID.agentPoolProfiles.count` | integer | Number of agents (VMs) to host docker containers. | `3` | +| `cluster.CLUSTER_ID.agentPoolProfiles.enableAutoScaling` | boolean | Whether to enable auto-scaler. | `false` | +| `cluster.CLUSTER_ID.agentPoolProfiles.minCount` | integer | The minimum number of nodes for auto-scaling. | `1` | +| `cluster.CLUSTER_ID.agentPoolProfiles.maxCount` | integer | The maximum number of nodes for auto-scaling. | `5` | +| `cluster.CLUSTER_ID.agentPoolProfiles.vmSize` | string | VM size availability varies by region. See: [Supported VM sizes](https://docs.microsoft.com/azure/aks/quotas-skus-regions#supported-vm-sizes) | `Standard_DS2_v2` | +| `cluster.CLUSTER_ID.tags` | object | Set of `key:value` tags attached to the AKS Cluster. This will override the global `resourceTags` configuration option for this resource. | | + + +{{}} + +{{< tab "GKE" >}} + +| Variable | Type | Description | Default value | +|----------------------------------------------------------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cluster.CLUSTER_ID.projectID` | string | The project ID is a unique identifier for a GCP project. | | +| `cluster.CLUSTER_ID.location` | string | The geo-location where the resource lives. | | +| `cluster.CLUSTER_ID.initialNodeCount` | integer | The number of nodes to create in this cluster's default node pool. | `3` | +| `cluster.CLUSTER_ID.nodeConfigs` | object | Parameters used in creating the default node pool. | | +| `cluster.CLUSTER_ID.nodeConfig.machineType` | string | The name of a Google Compute Engine machine type. | `e2-medium` | +| `cluster.CLUSTER_ID.clusterAutoscalings` | object list | Per-cluster configuration of Node Auto-Provisioning with Cluster Autoscaler to automatically adjust the size of the cluster and create/delete node pools based on the current needs of the cluster's workload. | | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].autoscalingProfile` | string | Lets you choose whether the cluster autoscaler should optimize for resource utilization or resource availability when deciding to remove nodes from a cluster. Valid values: [`BALANCED`, `OPTIMIZE_UTILIZATION`]. | `BALANCED` | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].enabled` | boolean | Whether node auto-provisioning is enabled. 
| `false` | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].resourceLimits` | object list | Global constraints for machine resources in the cluster. Configuring the cpu and memory types is required if node auto-provisioning is enabled. | resourceLimits:
- resource_type: cpu
  minimum: 1
  maximum: 1
- resource_type: memory
  minimum: 1
  maximum: 1 | +| `cluster.CLUSTER_ID.resourceLabels` | object | Set of `key:value` labels attached to the GKE Cluster. This will override the global `resourceTags` configuration option for this resource. | | + +{{}} +{{}} + +## Broker configuration options +The stack configuration `brokers` object is used to set up the key references to be used by the dataphos components to connect to one or more brokers deemed to be part of the overall platform infrastructure. + +Product configs directly reference brokers by their `BROKER_ID` listed in the broker config. The same applies to `TOPIC_ID` and `SUB_ID` – the keys of those objects are the actual names of the topics and subscriptions used. + +### Common broker configuration + +| Variable | Type | Description | +|--------------------------------------------------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers` | object | The object containing the general information on the brokers. | +| `brokers.BROKER_ID` | object | The object representing an individual broker's configuration. | +| `brokers.BROKER_ID.type` | string | Denotes the broker's type. Valid values: [`kafka`, `pubsub`, `servicebus`]. | +| `brokers.BROKER_ID.topics` | object | The object containing the general information on the topics. | +| `brokers.BROKER_ID.topics.TOPIC_ID` | object | The object representing an individual topic's configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions` | object | The object containing topic subscription (consumer group) configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID` | object | The object representing an individual topic subscription's configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +The Azure storage account type. Valid values: [`Storage`, `StorageV2`, `BlobStorage`, `BlockBlobStorage`, `FileStorage`]. The default and recommended value is `BlockBlobStorage`. + +### Specific broker configuration + +{{< tabs Broker configuration options >}} +{{< tab "Azure Service Bus" >}} +| Variable | Type | Description | +|-----------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers.BROKER_ID.azsbNamespace` | string | The Azure Service Bus namespace name. | +| `brokers.BROKER_ID.resourceGroup` | string | The Azure Service Bus resource group name. | +| `brokers.BROKER_ID.sku` | object | The Azure Service Bus namespace SKU properties. | +| `brokers.BROKER_ID.sku.name` | string | Name of this SKU. Valid values: [`BASIC`, `STANDARD`, `PREMIUM`]. Default value is `STANDARD`. | +| `brokers.BROKER_ID.sku.tier` | string | The billing tier of this SKU. [`BASIC`, `STANDARD`, `PREMIUM`]. Default value is `STANDARD`. 
| +| `brokers.BROKER_ID.sku.capacity` | integer | The specified messaging units for the tier. For Premium tier, valid capacities are 1, 2 and 4. | +| `brokers.BROKER_ID.tags` | object | Set of `key:value` tags attached to the Azure Service Bus namespace. This will override the global `resourceTags` configuration option for this resource. | +| `brokers.BROKER_ID.retain` | boolean | If set to true, the Azure Service Bus namespace will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +{{}} + + +{{< tab "Google Cloud Pub/Sub" >}} +| Variable | Type | Description | +|--------------------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers.BROKER_ID.projectID` | string | The GCP project ID. | +| `brokers.BROKER_ID.topics.TOPIC_ID.labels` | object | Set of `key:value` labels attached to the Pub/Sub topic. This will override the global `resourceTags` configuration option for this resource. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID.labels` | object | Set of `key:value` labels attached to the Pub/Sub subscription. This will override the global `resourceTags` configuration option for this resource. | + +{{}} + +{{< tab "Kafka" >}} +| Variable | Type | Description | Default value | +|------------------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| `brokers.BROKER_ID.brokerAddr` | string | The Kafka bootstrap server address. Optional. If omitted or empty, a new Strimzi Kafka cluster operator and cluster will be deployed with default settings. | | +| `brokers.BROKER_ID.clusterName` | string | The name of the Strimzi Kafka cluster custom Kubernetes resource. | `kafka-cluster` | +| `brokers.BROKER_ID.clusterNamespace` | string | The Kubernetes namespace where the cluster will be deployed. | `kafka-cluster` | +| `brokers.BROKER_ID.topics.TOPIC_ID.partitions` | integer | Number of partitions for a specific topic. | `3` | +| `brokers.BROKER_ID.topics.TOPIC_ID.replicas` | integer | Number of replicas for a specific topic. 
| `1` | + +{{}} +{{}} + diff --git a/dataphos-docs/content/publisher/configuration/shell.md b/dataphos-docs/content/publisher/configuration/shell.md new file mode 100644 index 0000000..7f39ee7 --- /dev/null +++ b/dataphos-docs/content/publisher/configuration/shell.md @@ -0,0 +1,33 @@ +--- +title: "Shell" +draft: false +weight: 1 +--- + + +## Parameters in YAML files needed for any Publisher deployment + +|Resource |Parameter Name |Description |Default | +|---|---|---|---| +|publisher-postgres-secret |POSTGRES_USER |Metadata database username |publisher | +|publisher-metadata-secret |METADATA_USERNAME |Metadata database username |publisher | +|publisher-postgres-secret |POSTGRES_PASSWORD |Metadata database password |samplePassworD1212 | +|publisher-metadata-secret |METADATA_PASSWORD |Metadata database password |samplePassworD1212 | +|publisher-postgres-secret |POSTGRES_DB | Metadata database name | publisher | +|encryption-keys |.stringData."keys.yaml" |Encryption keys for messages used by Worker component (32 bytes needed)| D2C0B5865AE141A49816F1FDC110FA5A| +|publisher-manager-secret |JWT_SECRET |JWT encryption key for secure communication between Manager and WebUI (16 bytes needed)|SuperSecretPass! | +|publisher-manager-ingress |host |Manager domain name | | +|publisher-webui-config |data."server.properties".window. MANAGER_ENDPOINT|Manager domain | | +|publisher-webui-ingress |host | Web UI domain name | | +|publisher-manager-config |WEB_UI |Web UI domain | | +|publisher-scheduler-config |SCHEMA_VALIDATION_URL |Schema Registry public URL or local Kubernetes service IP address | | + +In addition, the YAML file for deployment to GCP (utilizing the native GCP networking resources) requires these four additional parameters. + +|Resource Name |Parameter Name |Description | +|---|---|---| +|publisher-manager-ingress |kubernetes.io/ingress.global-static-ip-name |Manager ingress static IP address name | +|publisher-manager-ingress |ingress.gcp.kubernetes.io/pre-shared-cert |Manager Google managed certificate name | +|publisher-webui-ingress |kubernetes.io/ingress.global-static-ip-name |Web UI ingress static IP address | +|publisher-webui-ingress |ingress.gcp.kubernetes.io/pre-shared-cert | Web UI Google managed certificate name | +|pubsub-key | "key.json" |Base64 encoded PubSub key | diff --git a/dataphos-docs/content/publisher/quickstart/_index.md b/dataphos-docs/content/publisher/quickstart/_index.md new file mode 100644 index 0000000..7325cc4 --- /dev/null +++ b/dataphos-docs/content/publisher/quickstart/_index.md @@ -0,0 +1,12 @@ +--- +title: "Quickstart" +draft: false +weight: 3 +geekdocCollapseSection: true +--- + +There are 3 options for deploying dataphos components, including the Publisher: +{{< toc-tree >}} + +The quickstart guides will get you a working Publisher deployment as quickly as possible. Use any of the three deployment options and follow the guide. +The [Deployment Customization](/publisher/configuration) contains a detailed overview of configuration parameters if you wish to customize the configuration according to your requirements. diff --git a/dataphos-docs/content/publisher/quickstart/helm.md b/dataphos-docs/content/publisher/quickstart/helm.md new file mode 100644 index 0000000..a78f4fd --- /dev/null +++ b/dataphos-docs/content/publisher/quickstart/helm.md @@ -0,0 +1,308 @@ +--- +title: "Helm" +draft: false +weight: 2 +--- + +## Setting Up Your Environment + +### Prerequisites + +This quickstart guide will assume that you have +[Helm](https://helm.sh/) installed. 
+If you happen to be using VS Code make sure to have the Kubernetes and Helm extensions installed to make life a little easier for you. Helm repository can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm). + +Resources that are used must be running before the deployment. Publishers components run in a Kubernetes environment. This quickstart guide will assume that you have a running Kubernetes cluster on one of the following cloud providers (GCP, Azure). + +Publisher has multiple publishing destination options. This quickstart guide will assume that you want to publish data to **GCP Pub/Sub**, and that you have created a Pub/Sub topic and +a service account JSON key with the appropriate role (Pub/Sub Publisher) to do so. + +### Set Up the example source database + +Publisher has multiple data source options. This quickstart guide will use a mock Postgres database with mock invoice data as data ingestion source. The database will be deployed as a Kubernetes StatefulSet resource on +a separate namespace ("publisher-source") of the Kubernetes cluster. + +**Deploy and initialize** the Postgres database when applying the following YAML file: + +```bash +kubectl apply -f /referenced-scripts/YAML-examples/publisher/#publisher-postgresql-deployment +``` +The Postgres image, `syntioinc/dataphos-publisher-source-example:v0.0.2`, contains SQL scripts that create the "demo_invoices" table and insert invoice records upon creation. + +The database credentials are defined within the "publisher-postgres-source-secret" Kubernetes secret in the provided YAML. + +The "invoices" database can be accessed using a database client with "demo_user" as the username and "demo_password" as the password. + +### Create the Publisher namespace + +Create the namespace where Publisher will be deployed. We will use namespace `dataphos` in this quickstart guide. + +```bash +kubectl create namespace dataphos +``` +### Cloud provider configuration +{{< tabs "platformconfig" >}} +{{< tab "Azure" >}} +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). To handle HTTPS, two external components must be deployed to the cluster. One to enable external traffic to the cluster, and the other to ensure the external traffic uses HTTPS protocol. Installation is done using public images. + +## Helm + +The deployment of required prerequisites is done over Helm. Helm is the package manager for Kubernetes. Helm can be installed using Chocolatey (Windows) and Apt (Debian/Ubuntu). For other operating systems check the official documentation. + +On Windows: + +```yaml +choco install kubernetes-helm +``` + +On Debian/Ubuntu: + +```bash +curl https://baltocdn.com/helm/signing.asc | gpg --dearmor | sudo tee /usr/share/keyrings/helm.gpg > /dev/null +sudo apt-get install apt-transport-https --yes +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list +sudo apt-get update +sudo apt-get install helm +``` + +## Nginx Ingress Controller + +To manage external traffic to the cluster, an Ingress Controller solution should be provided. We opted for the Nginx Ingress Controller. 
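+
+If the `ingress-nginx` chart repository has not yet been added to your local Helm installation, add it first. The repository URL below is the public upstream one; adjust it if you mirror charts internally:
+
+```bash
+# Register the upstream ingress-nginx chart repository and refresh the local chart index.
+helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
+helm repo update
+```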
+ +Install the Nginx Ingress Controller: + +```bash +helm install ingress-nginx ingress-nginx/ingress-nginx --create-namespace --namespace ingress-basic --set controller.service.annotations."service\.beta\.kubernetes\.io/azure-load-balancer-health-probe-request-path"=/healthz +``` + +Generate TLS key and TLS certificate which will be used in secrets (you can use the use v3.conf file from [here](/referenced-scripts/YAML-examples/publisher/#v3-config)). + +```bash +openssl req -newkey rsa:2048 -nodes -keyout tls.key -out tls.csr -config v3.conf + +openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365 -extensions v3_req -extfile v3.conf +``` + + +## DNS records + +To use the Web UI and Manager components, DNS records need to be created for Ingress Controllers public IP address. + +Extract the Ingress Controller public IP address. + +```bash +kubectl get services nginx-ingress-ingress-nginx-controller --namespace ingress-basic \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` +On your domain provider, add two A record sets for the extracted IP address, one for the Manager component and one for the Web UI component. + +In case your organization does not own a registered domain, we recommend GoDaddy as the domain provider + +{{}} +{{< tab "GCP" >}} + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). It requires a predefined static IP address, domain name and certificate for both components inside the GCP project where Publisher is deployed for them to work properly. + +**1. Reserve static external IP addresses** + +With your working project selected on the GCP Console, navigate to VPC Network → IP addresses and click on the RESERVE EXTERNAL STATIC ADDRESS button in the top header. Create a Global IPv4 address named *publisher-manager-ip*. Repeat this procedure to create another address named *publisher-webui-ip*. Copy these IP addresses to use them in the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud compute addresses create publisher-manager-ip --project= --global +gcloud compute addresses create publisher-webui-ip --project= --global +gcloud compute addresses describe publisher-manager-ip --project= --global +gcloud compute addresses describe publisher-webui-ip --project= --global +``` +Replace ** with the name of your project. + + +**2. Create DNS records** + +To create DNS records, you need to have a domain registered with a domain provider. If you don’t have a domain, you can register one with a domain registrar, e.g. GoDaddy. +We will use Cloud DNS to manage DNS records in your domain. If you choose to skip this step, you need to create these records directly in your domain hosting service. + +Navigate to Network services → Cloud DNS and create a new public zone or select an existing one in any project. We will use *myzone.com* as the assumed DNS name in the rest of these instructions. Click on the NS type record created with the zone. Copy the four nameserver names, e.g. ns-cloud-x.googledomains.com. Go to your domain registrar and replace your default nameservers with the ones copied from the created Cloud DNS zone. After the change is propagated, all records created in the Cloud DNS zone will be created in the domain registrar. + +Click on the ADD RECORD SET button in the Zone details page of the managed zone that you want to add the record to. 
Create a DNS record with the *publisher-manager* subdomain in the DNS name field, using the Manager IP address you created previously. Repeat this procedure for the *publisher-webui* DNS record using the WebUI IP created in the previous step. Copy the full DNS names for the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud dns --project= record-sets create publisher-manager.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +gcloud dns --project= record-sets create publisher-webui.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +``` +Replace ``, ``, `` with the appropriate values and use your own zone and DNS name instead of *myzone*. + +**3. Create a Google-managed SSL certificate** + +With your working project selected on the GCP Console, navigate to Network services → Load balancing. The default page view doesn’t enable you to edit certificates, so scroll to the bottom of the page and click the “load balancing link components view” to switch the view to display the load balancing resources. Select the CERTIFICATES tab and click on CREATE SSL CERTIFICATE. Create a Google-managed certificate named *publisher-manager-cert* using the Manager DNS name (*publisher-manager.myzone.com*) in the Domain field. Repeat this procedure for the *publisher-webui-cert* using the WebUI DNS name (*publisher-webui.myzone.com*) in the Domain field. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud beta compute ssl-certificates create publisher-manager-cert --project= --global --domains=publisher-manager.myzone.com +gcloud beta compute ssl-certificates create publisher-webui-cert --project= --global --domains=publisher-webui.myzone.com +``` +Replace ` with the name of your project and use the previously created DNS record values for the domain values. +{{}} +{{}} + +### Create the GCP Service Account Kubernetes secret + +Publisher requires that the GCP Service Account key for a service account with the appropriate role is added as a Kubernetes secret on the cluster. + +Having assumed you have previously created the GCP service account, **download** the JSON containing the service account information. Position yourself in the directory containing your Pub/Sub key file and create the Kubernetes secret using the following command. + +```bash +kubectl create secret generic pubsub-key --from-file=key.json=.json -n dataphos +``` +### Authentication and encryption + +Select the Postgres database credentials (username and password) you wish to use. The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). + +### Chart Usage + +Each chart has its own configuration settings outlined in its respective subfolder. Charts can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm/tree/master/dataphos-publisher). A values.yaml file should be prepared and pass to Helm while performing the installation. The Web UI component is separated from the publisher in the extra chart. 
+ +To deploy the `dataphos-publisher` and `dataphos-publisher-webui` charts, run: + +```bash +helm install publisher ./dataphos-publisher +helm install publisher-webui ./dataphos-publisher-webui +``` + +This causes the `values.yaml` file to be read from the root directory of the `dataphos-publisher` and the `dataphos-publisher-webui` folders. The `--values` flag may be passed in the call to override this behavior. + +You can also add a `--dry-run` flag that will simply generate the Kubernetes manifests and check if they are valid (note that this requires `kubectl` to be configured against an actual cluster). For general linting of the Helm templates, run `helm lint`. + +## Start the Publisher Web UI + +Following the deployment, you can connect to the Publisher via its WebUI. + +To log in, use the admin username `publisher_admin` with the password `Adm!n`. + +To start a Publisher instance, Publisher configuration files should be added through the Web CLI. + +Access the Web UI by its public IP address and open the Web CLI tab. + +To get the Web UI IP address, run the following command: + +```bash +kubectl get services publisher-webui --namespace dataphos \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +To access the Web UI, paste the Web UI IP address in your web browser and specify port 8080, e.g. `http://1.1.1.1:8080`. + +### Starting a Publisher Instance + +First, the source configuration should be created. The source database will be accessed by its public IP address. + +To get the source database IP address, run the following command: + +```bash +kubectl get services publisher-postgres-source --namespace publisher-source \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +In the UI, navigate to the **WebCLI** tab and load the following YAML file as a **Source**. + +Add the following source configuration for the Publisher to connect to the "invoices" database we created. + +```yaml +sourceName: publisher-postgres-source +sourceType: Postgres +host: +port: 5432 +databaseName: invoices +username: demo_user +password: demo_password +``` + +Still within the **WebCLI** tab, load the following YAML file as a **Destination**. + +Add the following destination configuration for your Pub/Sub topic. Keys with brackets should be replaced with your values. + +```yaml +destinationName: publisher-pubsub-destination +destinationType: PubSub +parameters: + ProjectID: + TopicID: +``` + +Finally, load the following instance configuration that ingests data from the invoice source, forms business objects according to the definition, and publishes messages to Pub/Sub.
+ +```yaml +publisherName: publisher-demo +sourceName: publisher-postgres-source +destinationName: publisher-pubsub-destination +serializationType: Avro +encryptionEnabled: false +businessObject: + description: "Demo Publisher - invoices for client" + objectGroup: "invoices-client" + additionalMetadata: + organization: Syntio + definition: + - client_info: + - client_id + - client_username + - client_location + - invoice_info: + - invoice_id + - creation_date + - due_date + - fully_paid_date + - invoice_items: + - invoice_item_id + - quantity + - total_cost + groupElements: + - client_id + - invoice_id + arrayElements: + - invoice_items + keyElements: + - client_id + - invoice_id + idElements: + - client_id + query: | + SELECT + invoice_id, + client_id, + client_username, + client_location, + creation_date, + due_date, + fully_paid_date, + invoice_item_id, + quantity, + total_cost, + billing_item_id + FROM demo_invoices + WHERE + creation_date >= to_timestamp({{ .FetchFrom }},'YYYY-MM-DD HH24:MI:SS') + AND + creation_date < to_timestamp({{ .FetchTo }}, 'YYYY-MM-DD HH24:MI:SS'); +fetcherConfig: + fetchingThreadsNO: 3 + queryIncrementType: HOUR + queryIncrementValue: 12 + initialFetchValue: 2020-01-01 00:20:00.000 + useNativeDriver: true +``` diff --git a/dataphos-docs/content/publisher/quickstart/pulumi.md b/dataphos-docs/content/publisher/quickstart/pulumi.md new file mode 100644 index 0000000..16a7546 --- /dev/null +++ b/dataphos-docs/content/publisher/quickstart/pulumi.md @@ -0,0 +1,529 @@ +--- +title: "Pulumi" +draft: false +weight: 3 +--- + +## Setting up your environment + +### Prerequisites + +Publisher's components run in a Kubernetes environment. This quickstart guide will assume that you have +[Python 3](https://www.python.org/downloads/) and [Pulumi](https://www.pulumi.com/docs/install/) tools installed. Pulumi repository can be accessed on the [Pulumi repository](https://github.com/dataphos/dataphos-infra). + +This quickstart guide will assume creating new resources instead of importing existing ones into the active stack. If you wish to import your own resources, check [Deployment Customization](/publisher/configuration/pulumi). + +### Example source database + +Publisher has multiple data source options. This quickstart guide will use a mock Postgres database with mock invoice data as a data ingestion source. The database is deployed as a Kubernetes StatefulSet resource using Pulumi. + +The database credentials are defined as environment variables within the container inside the postgres template. + +The "invoices" database can be accessed using a database client with "demo_user" as the username and "demo_password" as the password. + + +### Publisher namespace + +The namespace where the components will be deployed is defined in the config file, you don't have to create it yourself. We will use the namespace `dataphos` in this guide. + +```bash + namespace: dataphos +``` + +### Download the Publisher Helm charts + +The Dataphos Helm charts are located in the [Dataphos Helm Repository](https://github.com/dataphos/dataphos-helm). + +To properly reference the Publisher charts, clone the Helm repository and copy the entire `dataphos-publisher` and `dataphos-publisher-webui` directories into the `helm_charts` directory of this repository. 
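+ +A minimal sketch of that step (assuming it is run from the root of the cloned dataphos-infra project and that the `helm_charts` directory already exists there): + +```bash +# Clone the Dataphos Helm repository and copy the two Publisher charts into helm_charts +git clone https://github.com/dataphos/dataphos-helm.git +cp -r dataphos-helm/dataphos-publisher dataphos-helm/dataphos-publisher-webui ./helm_charts/ +```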
+ +### Install Dependencies + +Create a virtual environment from the project root directory and activate it: + +```bash +py -m venv venv +./venv/Scripts/activate +``` + +Install package dependencies: +```bash +py -m pip install -r ./requirements.txt +``` + +Note: This usually doesn't take long, but can take up to 45 minutes, depending on your setup. + +## Publisher deployment + +### Cloud provider and stack configuration + + +{{< tabs "deployment" >}} +{{< tab "GCP-Kafka" >}} +### GCP-Kafka + +Deploy all of the required Publisher components for publishing messages to the Kafka topic. + +Install the Google Cloud SDK and then authorize access with a user account. Next, Pulumi requires default application credentials to interact with your Google Cloud resources, so run auth application-default login command to obtain those credentials: + +```bash +$ gcloud auth application-default login +``` + +### Configure your stack + +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers. Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. + +```bash +$ pulumi stack init publisher-gcp-kafka-dev +``` +This will create a new stack named `publisher-gcp-kafka-dev` in your project and set it as the active stack. + + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). It requires a predefined static IP address, domain name and certificate for both components inside the GCP project where Publisher is deployed for them to work properly. + +**1. Reserve static external IP addresses** + +With your working project selected on the GCP Console, navigate to VPC Network → IP addresses and click on the RESERVE EXTERNAL STATIC ADDRESS button in the top header. Create a Global IPv4 address named *publisher-manager-ip*. Repeat this procedure to create another address named *publisher-webui-ip*. Copy these IP addresses to use them in the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud compute addresses create publisher-manager-ip --project= --global +gcloud compute addresses create publisher-webui-ip --project= --global +gcloud compute addresses describe publisher-manager-ip --project= --global +gcloud compute addresses describe publisher-webui-ip --project= --global +``` +Replace ** with the name of your project. + + +**2. Create DNS records** + +To create DNS records, you need to have a domain registered with a domain provider. If you don’t have a domain, you can register one with a domain registrar, e.g. GoDaddy. +We will use Cloud DNS to manage DNS records in your domain. If you choose to skip this step, you need to create these records directly in your domain hosting service. + +Navigate to Network services → Cloud DNS and create a new public zone or select an existing one in any project. We will use *myzone.com* as the assumed DNS name in the rest of these instructions. Click on the NS type record created with the zone. Copy the four nameserver names, e.g. ns-cloud-x.googledomains.com. 
Go to your domain registrar and replace your default nameservers with the ones copied from the created Cloud DNS zone. After the change is propagated, all records created in the Cloud DNS zone will be created in the domain registrar. + +Click on the ADD RECORD SET button in the Zone details page of the managed zone that you want to add the record to. Create a DNS record with the *publisher-manager* subdomain in the DNS name field, using the Manager IP address you created previously. Repeat this procedure for the *publisher-webui* DNS record using the WebUI IP created in the previous step. Copy the full DNS names for the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud dns --project= record-sets create publisher-manager.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +gcloud dns --project= record-sets create publisher-webui.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +``` +Replace ``, ``, `` with the appropriate values and use your own zone and DNS name instead of *myzone*. + +**3. Create a Google-managed SSL certificate** + +With your working project selected on the GCP Console, navigate to Network services → Load balancing. The default page view doesn’t enable you to edit certificates, so scroll to the bottom of the page and click the “load balancing link components view” to switch the view to display the load balancing resources. Select the CERTIFICATES tab and click on CREATE SSL CERTIFICATE. Create a Google-managed certificate named *publisher-manager-cert* using the Manager DNS name (*publisher-manager.myzone.com*) in the Domain field. Repeat this procedure for the *publisher-webui-cert* using the WebUI DNS name (*publisher-webui.myzone.com*) in the Domain field. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud beta compute ssl-certificates create publisher-manager-cert --project= --global --domains=publisher-manager.myzone.com +gcloud beta compute ssl-certificates create publisher-webui-cert --project= --global --domains=publisher-webui.myzone.com +``` +Replace ` with the name of your project and use the previously created DNS record values for the domain values. + +## Authentication and encryption +Select Postgres database credentials (username and password) you wish to use. The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). + +{{}} +{{< tab "GCP-PubSub" >}} + +### GCP-PubSub + +Deploy all of the required Publisher components for publishing messages to the PubSub topic. + +Install the Google Cloud SDK and then authorize access with a user account. Next, Pulumi requires default application credentials to interact with your Google Cloud resources, so run auth application-default login command to obtain those credentials: + +```bash +$ gcloud auth application-default login +``` + +### Configure your stack + +You can use a stack configuration template file to quickly deploy and modify common architectures. 
This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. + +```bash +$ pulumi stack init publisher-gcp-pubsub-dev +``` +This will create a new stack named `publisher-gcp-pubsub-dev` in your project and set it as the active stack. + + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). It requires a predefined static IP address, domain name and certificate for both components inside the GCP project where Publisher is deployed for them to work properly. + +**1. Reserve static external IP addresses** + +With your working project selected on the GCP Console, navigate to VPC Network → IP addresses and click on the RESERVE EXTERNAL STATIC ADDRESS button in the top header. Create a Global IPv4 address named *publisher-manager-ip*. Repeat this procedure to create another address named *publisher-webui-ip*. Copy these IP addresses to use them in the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud compute addresses create publisher-manager-ip --project= --global +gcloud compute addresses create publisher-webui-ip --project= --global +gcloud compute addresses describe publisher-manager-ip --project= --global +gcloud compute addresses describe publisher-webui-ip --project= --global +``` +Replace ** with the name of your project. + + +**2. Create DNS records** + +To create DNS records, you need to have a domain registered with a domain provider. If you don’t have a domain, you can register one with a domain registrar, e.g. GoDaddy. +We will use Cloud DNS to manage DNS records in your domain. If you choose to skip this step, you need to create these records directly in your domain hosting service. + +Navigate to Network services → Cloud DNS and create a new public zone or select an existing one in any project. We will use *myzone.com* as the assumed DNS name in the rest of these instructions. Click on the NS type record created with the zone. Copy the four nameserver names, e.g. ns-cloud-x.googledomains.com. Go to your domain registrar and replace your default nameservers with the ones copied from the created Cloud DNS zone. After the change is propagated, all records created in the Cloud DNS zone will be created in the domain registrar. + +Click on the ADD RECORD SET button in the Zone details page of the managed zone that you want to add the record to. Create a DNS record with the *publisher-manager* subdomain in the DNS name field, using the Manager IP address you created previously. Repeat this procedure for the *publisher-webui* DNS record using the WebUI IP created in the previous step. Copy the full DNS names for the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud dns --project= record-sets create publisher-manager.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +gcloud dns --project= record-sets create publisher-webui.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +``` +Replace ``, ``, `` with the appropriate values and use your own zone and DNS name instead of *myzone*. + +**3. 
Create a Google-managed SSL certificate** + +With your working project selected on the GCP Console, navigate to Network services → Load balancing. The default page view doesn’t enable you to edit certificates, so scroll to the bottom of the page and click the “load balancing link components view” to switch the view to display the load balancing resources. Select the CERTIFICATES tab and click on CREATE SSL CERTIFICATE. Create a Google-managed certificate named *publisher-manager-cert* using the Manager DNS name (*publisher-manager.myzone.com*) in the Domain field. Repeat this procedure for the *publisher-webui-cert* using the WebUI DNS name (*publisher-webui.myzone.com*) in the Domain field. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud beta compute ssl-certificates create publisher-manager-cert --project= --global --domains=publisher-manager.myzone.com +gcloud beta compute ssl-certificates create publisher-webui-cert --project= --global --domains=publisher-webui.myzone.com +``` +Replace ` with the name of your project and use the previously created DNS record values for the domain values. + +## Authentication and encryption +Select Postgres database credentials (username and password) you wish to use. The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). + +{{}} + +{{< tab "Azure-Kafka" >}} +### Azure-Kafka + +Deploy all of the required Publisher components to the Azure Cloud and publish messages to the Kafka broker. + +Log in to the Azure CLI and Pulumi will automatically use your credentials: +```bash +$ az login +``` + +### Configure your stack +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. + +```bash +$ pulumi stack init publisher-azure-kafka-dev +``` +This will create a new stack named `publisher-azure-kafka-dev` in your project and set it as the active stack. + + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). To handle HTTPS, two external components must be deployed to the cluster. One to enable external traffic to the cluster, and the other to ensure the external traffic uses HTTPS protocol. Installation is done using public images. + +## Helm + +The deployment of required prerequisites is done over Helm. Helm is the package manager for Kubernetes. Helm can be installed using Chocolatey (Windows) and Apt (Debian/Ubuntu). For other operating systems check the official documentation. 
+ +On Windows: + +```yaml +choco install kubernetes-helm +``` + +On Debian/Ubuntu: + +```bash +curl https://baltocdn.com/helm/signing.asc | gpg --dearmor | sudo tee /usr/share/keyrings/helm.gpg > /dev/null +sudo apt-get install apt-transport-https --yes +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list +sudo apt-get update +sudo apt-get install helm +``` + +## Nginx Ingress Controller + +To manage external traffic to the cluster, an Ingress Controller solution should be provided. We opted for the Nginx Ingress Controller. + +Install the Nginx Ingress Controller: + +```bash +helm install ingress-nginx ingress-nginx/ingress-nginx --create-namespace --namespace ingress-basic --set controller.service.annotations."service\.beta\.kubernetes\.io/azure-load-balancer-health-probe-request-path"=/healthz +``` + +Generate TLS key and TLS certificate which will be used in secrets (you can use the use v3.conf file from [here](https://github.com/dataphos/dataphos-docs/tree/main/examples/publisher/v3.conf)). + +```bash +openssl req -newkey rsa:2048 -nodes -keyout tls.key -out tls.csr -config v3.conf + +openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365 -extensions v3_req -extfile v3.conf +``` + + +## DNS records + +To use the Web UI and Manager components, DNS records need to be created for Ingress Controllers public IP address. + +Extract the Ingress Controller public IP address. + +```bash +kubectl get services nginx-ingress-ingress-nginx-controller --namespace ingress-basic \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` +On your domain provider, add two A record sets for the extracted IP address, one for the Manager component and one for the Web UI component. + +In case your organization does not own a registered domain, we recommend GoDaddy as the domain provider. + +## Authentication and encryption + +Select Postgres database credentials (username and password) you wish to use. The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). + +{{}} +{{< tab "Azure-ServiceBus" >}} +### Azure-ServiceBus + +Deploy all of the required Publisher components to the Azure Cloud and publish messages to the ServiceBus. + +Log in to the Azure CLI and Pulumi will automatically use your credentials: +```bash +$ az login +``` + +### Configure your stack +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. 
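+ +For example, the copy step might look like this (a sketch only; the exact template file name is an assumption based on Pulumi's `Pulumi.<stack-name>.yaml` naming convention, so check the `config_templates` directory for the actual name): + +```bash +# Copy the pre-configured stack template into the project root +cp config_templates/Pulumi.publisher-azure-sb-dev.yaml . +```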
+ +```bash +$ pulumi stack init publisher-azure-sb-dev +``` +This will create a new stack named `publisher-azure-sb-dev` in your project and set it as the active stack. + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). To handle HTTPS, two external components must be deployed to the cluster. One to enable external traffic to the cluster, and the other to ensure the external traffic uses HTTPS protocol. Installation is done using public images. + +## Helm + +The deployment of required prerequisites is done over Helm. Helm is the package manager for Kubernetes. Helm can be installed using Chocolatey (Windows) and Apt (Debian/Ubuntu). For other operating systems check the official documentation. + + +On Windows: + +```yaml +choco install kubernetes-helm +``` + +On Debian/Ubuntu: + +```bash +curl https://baltocdn.com/helm/signing.asc | gpg --dearmor | sudo tee /usr/share/keyrings/helm.gpg > /dev/null +sudo apt-get install apt-transport-https --yes +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list +sudo apt-get update +sudo apt-get install helm +``` + +## Nginx Ingress Controller + +To manage external traffic to the cluster, an Ingress Controller solution should be provided. We opted for the Nginx Ingress Controller. + +Install the Nginx Ingress Controller: + +```bash +helm install ingress-nginx ingress-nginx/ingress-nginx --create-namespace --namespace ingress-basic --set controller.service.annotations."service\.beta\.kubernetes\.io/azure-load-balancer-health-probe-request-path"=/healthz +``` + +Generate TLS key and TLS certificate which will be used in secrets (you can use the use v3.conf file from [here](https://github.com/dataphos/dataphos-docs/tree/master/examples/publisher/v3.conf)). + +```bash +openssl req -newkey rsa:2048 -nodes -keyout tls.key -out tls.csr -config v3.conf + +openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365 -extensions v3_req -extfile v3.conf +``` + + +## DNS records + +To use the Web UI and Manager components, DNS records need to be created for Ingress Controllers public IP address. + +Extract the Ingress Controller public IP address. + +```bash +kubectl get services nginx-ingress-ingress-nginx-controller --namespace ingress-basic \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` +On your domain provider, add two A record sets for the extracted IP address, one for the Manager component and one for the Web UI component. + +In case your organization does not own a registered domain, we recommend GoDaddy as the domain provider. + +## Authentication and encryption + +Select the Postgres database credentials (username and password) you wish to use. The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). 
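+ +One way to produce such keys is OpenSSL's random generator (a sketch; it assumes the keys are supplied as plain ASCII strings of the stated length): + +```bash +# 32-character random string, a candidate for the 32B encryption key (ENC_KEY_1) +openssl rand -hex 16 +# 16-character random string, a candidate for the 16B JWT secret (JWT_SECRET) +openssl rand -hex 8 +```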
+ +{{}} +{{}} + +### Deployment + +Preview and deploy infrastructure changes: +```bash +$ pulumi up +``` +Destroy your infrastructure changes: +```bash +$ pulumi destroy +``` + +## Start the Publisher Web UI + +Following the deployment, you can connect to the Publisher via its WebUI. + +To login use the admin username `publisher_admin` with the password `Adm!n`. + +To start a Publisher instance, Publisher configuration files should be added through the Web CLI. + +Access the Web UI by its public IP address and open the Web CLI tab. + +To get the Web UI IP address run the following command. + +```bash +kubectl get services publisher-webui --namespace dataphos \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` + +To access the Web UI, paste the Web UI IP address in your web browser and specify port 8080, e.g. `http://1.1.1.1:8080`. + +### Starting a Publisher Instance section + +First, the source configuration should be created. The source database will be accessed by its public IP address. + +To get the source database IP address run the following command. + +```bash +kubectl get services publisher-postgres-source --namespace publisher-source \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` + +In the UI, navigate to the **WebCLI** tab and load the following YAML file as a **Source**. + +Add the following source configuration for the Publisher to connect to the "invoices" database we created. + +```yaml +sourceName: publisher-postgres-source +sourceType: Postgres +host: +port: 5432 +databaseName: invoices +username: demo_user +password: demo_password +``` + +Still within the **WebCLI** tab, load the following YAML file as a **Destination**. + +Add the following destination configuration for your Pub/Sub topic. Keys with brackets should be replaced with your values. + +```yaml +destinationName: publisher-pubsub-destination +destinationType: PubSub +parameters: + ProjectID: + TopicID: +``` + +Finally, load the following instance configuration that ingests data from the invoice source, forms business objects according to the definition, and publishes messages to Pub/Sub. 
+ +```yaml +publisherName: publisher-demo +sourceName: publisher-postgres-source +destinationName: publisher-pubsub-destination +serializationType: Avro +encryptionEnabled: false +businessObject: + description: "Demo Publisher - invoices for client" + objectGroup: "invoices-client" + additionalMetadata: + organization: Syntio + definition: + - client_info: + - client_id + - client_username + - client_location + - invoice_info: + - invoice_id + - creation_date + - due_date + - fully_paid_date + - invoice_items: + - invoice_item_id + - quantity + - total_cost + groupElements: + - client_id + - invoice_id + arrayElements: + - invoice_items + keyElements: + - client_id + - invoice_id + idElements: + - client_id + query: | + SELECT + invoice_id, + client_id, + client_username, + client_location, + creation_date, + due_date, + fully_paid_date, + invoice_item_id, + quantity, + total_cost, + billing_item_id + FROM demo_invoices + WHERE + creation_date >= to_timestamp({{ .FetchFrom }},'YYYY-MM-DD HH24:MI:SS') + AND + creation_date < to_timestamp({{ .FetchTo }}, 'YYYY-MM-DD HH24:MI:SS'); +fetcherConfig: + fetchingThreadsNO: 3 + queryIncrementType: HOUR + queryIncrementValue: 12 + initialFetchValue: 2020-01-01 00:20:00.000 + useNativeDriver: true +``` diff --git a/dataphos-docs/content/publisher/quickstart/shell.md b/dataphos-docs/content/publisher/quickstart/shell.md new file mode 100644 index 0000000..f10fbf8 --- /dev/null +++ b/dataphos-docs/content/publisher/quickstart/shell.md @@ -0,0 +1,329 @@ +--- +title: "Shell" +draft: false +weight: 1 +--- + +## Setting up your environment + +### Prerequisites + +Publishers components run in a Kubernetes environment. This quickstart guide will assume that you have a running Kubernetes cluster on one of the following cloud providers (GCP, Azure). +The Kubernetes cluster node/s should have at least 8 GB of available RAM. + +Publisher has multiple publishing destination options. This quickstart guide will assume that you want to publish data to **GCP Pub/Sub**, and that you have created a Pub/Sub topic and +a service account JSON key with the appropriate role (Pub/Sub Publisher) to do so. + +Note: if you wish to use Docker or any other Kubernetes instead of GCP or Azure, use this [guide]({{< ref "../configuration/_index.md#reference_anykubernetes">}}) for deployment. + + +### Set Up the example source database + +The Publisher has multiple data source options. This quickstart guide will use a mock Postgres database with mock invoice data as a data ingestion source. The database will be deployed as a Kubernetes StatefulSet resource on +a separate namespace ("publisher-source") of the Kubernetes cluster. + +**Deploy and initialize** the Postgres database when applying the following [YAML file](/referenced-scripts/YAML-examples/publisher/#publisher-postgresql-deployment): + +```bash +kubectl apply -f postgres_deployment.yaml +``` +The Postgres image, `syntioinc/dataphos-publisher-source-example:v0.0.2`, contains SQL scripts that create the "demo_invoices" table and insert invoice records upon creation. + +The database credentials are defined within the "publisher-postgres-source-secret" Kubernetes secret in the provided YAML. + +The "invoices" database can be accessed using a database client with "demo_user" as the username and "demo_password" as the password. + + +### Cloud provider configuration +{{< tabs "platformconfig" >}} +{{< tab "Azure" >}} +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). 
To handle HTTPS, two external components must be deployed to the cluster. One to enable external traffic to the cluster, and the other to ensure the external traffic uses HTTPS protocol. Installation is done using public images. + +## Helm + +The deployment of required prerequisites is done over Helm. Helm is the package manager for Kubernetes. Helm can be installed using Chocolatey (Windows) and Apt (Debian/Ubuntu). For other operating systems check the official documentation. + +On Windows: + +```yaml +choco install kubernetes-helm +``` + +On Debian/Ubuntu: + +```bash +curl https://baltocdn.com/helm/signing.asc | gpg --dearmor | sudo tee /usr/share/keyrings/helm.gpg > /dev/null +sudo apt-get install apt-transport-https --yes +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list +sudo apt-get update +sudo apt-get install helm +``` + +## Nginx Ingress Controller + +To manage external traffic to the cluster, an Ingress Controller solution should be provided. We opted for the Nginx Ingress Controller. + +Install the Nginx Ingress Controller: + +```bash +helm install ingress-nginx ingress-nginx/ingress-nginx --create-namespace --namespace ingress-basic --set controller.service.annotations."service\.beta\.kubernetes\.io/azure-load-balancer-health-probe-request-path"=/healthz +``` + +Generate TLS key and TLS certificate which will be used in secrets (you can use the use v3.conf file from [here](/referenced-scripts/YAML-examples/publisher/#v3-config)). + +```bash +openssl req -newkey rsa:2048 -nodes -keyout tls.key -out tls.csr -config v3.conf + +openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365 -extensions v3_req -extfile v3.conf +``` + + +## DNS records + +To use the Web UI and Manager components, DNS records need to be created for Ingress Controllers public IP address. + +Extract the Ingress Controller public IP address. + +```bash +kubectl get services nginx-ingress-ingress-nginx-controller --namespace ingress-basic \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +/ +``` +On your domain provider, add two A record sets for the extracted IP address, one for the Manager component and one for the Web UI component. + +In case your organization does not own a registered domain, we recommend GoDaddy as the domain provider + +{{}} +{{< tab "GCP" >}} + +## HTTPS setup + +Publisher uses HTTPS for two of its components (Web UI and Manager). It requires a predefined static IP address, domain name and certificate for both components inside the GCP project where Publisher is deployed for them to work properly. + +**1. Reserve static external IP addresses** + +With your working project selected on the GCP Console, navigate to VPC Network → IP addresses and click on the RESERVE EXTERNAL STATIC ADDRESS button in the top header. Create a Global IPv4 address named *publisher-manager-ip*. Repeat this procedure to create another address named *publisher-webui-ip*. Copy these IP addresses to use them in the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud compute addresses create publisher-manager-ip --project= --global +gcloud compute addresses create publisher-webui-ip --project= --global +gcloud compute addresses describe publisher-manager-ip --project= --global +gcloud compute addresses describe publisher-webui-ip --project= --global +``` +Replace ** with the name of your project. 
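+ +To print only the reserved IP addresses (you will need them for the DNS records in the next step), the `describe` commands can be narrowed down with gcloud's standard `--format` flag, for example: + +```bash +# Add --project if your active gcloud project is not the one used above +gcloud compute addresses describe publisher-manager-ip --global --format="value(address)" +gcloud compute addresses describe publisher-webui-ip --global --format="value(address)" +```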
+ + +**2. Create DNS records** + +To create DNS records, you need to have a domain registered with a domain provider. If you don’t have a domain, you can register one with a domain registrar, e.g. GoDaddy. +We will use Cloud DNS to manage DNS records in your domain. If you choose to skip this step, you need to create these records directly in your domain hosting service. + +Navigate to Network services → Cloud DNS and create a new public zone or select an existing one in any project. We will use *myzone.com* as the assumed DNS name in the rest of these instructions. Click on the NS type record created with the zone. Copy the four nameserver names, e.g. ns-cloud-x.googledomains.com. Go to your domain registrar and replace your default nameservers with the ones copied from the created Cloud DNS zone. After the change is propagated, all records created in the Cloud DNS zone will be created in the domain registrar. + +Click on the ADD RECORD SET button in the Zone details page of the managed zone that you want to add the record to. Create a DNS record with the *publisher-manager* subdomain in the DNS name field, using the Manager IP address you created previously. Repeat this procedure for the *publisher-webui* DNS record using the WebUI IP created in the previous step. Copy the full DNS names for the next step. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud dns --project= record-sets create publisher-manager.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +gcloud dns --project= record-sets create publisher-webui.myzone.com. --zone="myzone" --type="A" --ttl="300" --rrdatas= +``` +Replace ``, ``, `` with the appropriate values and use your own zone and DNS name instead of *myzone*. + +**3. Create a Google-managed SSL certificate** + +With your working project selected on the GCP Console, navigate to Network services → Load balancing. The default page view doesn’t enable you to edit certificates, so scroll to the bottom of the page and click the “load balancing link components view” to switch the view to display the load balancing resources. Select the CERTIFICATES tab and click on CREATE SSL CERTIFICATE. Create a Google-managed certificate named *publisher-manager-cert* using the Manager DNS name (*publisher-manager.myzone.com*) in the Domain field. Repeat this procedure for the *publisher-webui-cert* using the WebUI DNS name (*publisher-webui.myzone.com*) in the Domain field. + +Alternatively, run the following gcloud commands in a console window: + +```bash +gcloud beta compute ssl-certificates create publisher-manager-cert --project= --global --domains=publisher-manager.myzone.com +gcloud beta compute ssl-certificates create publisher-webui-cert --project= --global --domains=publisher-webui.myzone.com +``` +Replace ` with the name of your project and use the previously created DNS record values for the domain values. +{{}} +{{}} + +### Create the Publisher namespace + +Create the namespace where the Publisher will be deployed. We will use namespace `dataphos` in this quickstart guide. + +```bash +kubectl create namespace dataphos +``` + +### Create the GCP Service Account Kubernetes secret + +Publisher requires that the GCP Service Account key for a service account with the appropriate role is added as a Kubernetes secret on the cluster. + +Having assumed you have previously created the GCP service account, **download** the JSON containing the service account information. 
Position yourself in the directory containing your Pub/Sub key file and create the Kubernetes secret using the following command. + +```bash +kubectl create secret generic pubsub-key --from-file=key.json=.json -n dataphos +``` + +### Authentication and encryption + +Select the Postgres database credentials (username and password) you wish to use. The password must contain at least nine characters, of which there are two uppercase letters, two lowercase letters, and two numbers. + +Generate a 32B Encryption key using a random key generator, or use the default one provided in the deployment file, for messages used by the Worker component (ENC_KEY_1). + +Generate a 16B JWT encryption key for secure communication, or use the default one provided in the deployment file (JWT_SECRET). + +### Deploy Publisher + +The script to deploy Publisher can be found [here](/referenced-scripts/deployment-scripts/publisher/#dataphos-publisher) (both the Bash and Powershell versions). The required arguments are: + +* The namespace +* The Publisher Metadata database username +* The Publisher Metadata database password + +To run the script, use the following command: + +{{< tabs "deployscripts" >}} +{{< tab "Bash" >}} +```bash +# "dataphos" is an example of the namespace name +# "syntioUser" is an example of the metadata Postgres username +# "p4sSw0rD" is an example of the metadata Postgres password +./publisher.sh -n dataphos -u syntioUser -p p4sSw0rD +``` +{{< /tab >}} +{{< tab "Powershell" >}} +```powershell +# "dataphos" is an example of the namespace name +# "syntioUser" is an example of the metadata Postgres username +# "p4sSw0rD" is an example of the metadata Postgres password +.\publisher.ps1 dataphos syntioUser p4sSw0rD +``` +{{< /tab >}} +{{< /tabs >}} + +This will create all of the required components on your cluster. + +Note that this page will deploy the WebUI and Manager services as LoadBalancer, not as Ingress components. The deployment steps for using Ingress are described on the [Usage](/publisher/usage) page. + +## Start the Publisher Web UI + +Following the deployment, you can connect to the Publisher via its WebUI. + +To log in, use the admin username `publisher_admin` with the password `Adm!n`. + +To start a Publisher instance, Publisher configuration files should be added through the Web CLI. + +Access the Web UI by its public IP address and open the Web CLI tab. + +To get the Web UI IP address, run the following command: + +```bash +kubectl get services publisher-webui --namespace dataphos \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +To access the Web UI, paste the Web UI IP address in your web browser and specify port 8080, e.g. `http://1.1.1.1:8080`. + +### Starting a Publisher Instance + +First, the source configuration should be created. The source database will be accessed by its public IP address. + +To get the source database IP address, run the following command: + +```bash +kubectl get services publisher-postgres-source --namespace publisher-source \ + --output jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +In the UI, navigate to the **WebCLI** tab and load the following YAML file as a **Source**. + +Add the following source configuration for the Publisher to connect to the "invoices" database we created.
+ +```yaml +sourceName: publisher-postgres-source +sourceType: Postgres +host: +port: 5432 +databaseName: invoices +username: demo_user +password: demo_password +``` + +Still within the **WebCLI** tab, load the following YAML file as a **Destination**. + +Add the following destination configuration for your Pub/Sub topic. Keys with brackets should be replaced with your values. + +```yaml +destinationName: publisher-pubsub-destination +destinationType: PubSub +parameters: + ProjectID: + TopicID: +``` + +Finally, load the following instance configuration that ingests data from the invoice source, forms business objects according to the definition, and publishes messages to Pub/Sub. + +```yaml +publisherName: publisher-demo +sourceName: publisher-postgres-source +destinationName: publisher-pubsub-destination +serializationType: Avro +encryptionEnabled: false +businessObject: + description: "Demo Publisher - invoices for client" + objectGroup: "invoices-client" + additionalMetadata: + organization: Syntio + definition: + - client_info: + - client_id + - client_username + - client_location + - invoice_info: + - invoice_id + - creation_date + - due_date + - fully_paid_date + - invoice_items: + - invoice_item_id + - quantity + - total_cost + groupElements: + - client_id + - invoice_id + arrayElements: + - invoice_items + keyElements: + - client_id + - invoice_id + idElements: + - client_id + query: | + SELECT + invoice_id, + client_id, + client_username, + client_location, + creation_date, + due_date, + fully_paid_date, + invoice_item_id, + quantity, + total_cost, + billing_item_id + FROM demo_invoices + WHERE + creation_date >= to_timestamp({{ .FetchFrom }},'YYYY-MM-DD HH24:MI:SS') + AND + creation_date < to_timestamp({{ .FetchTo }}, 'YYYY-MM-DD HH24:MI:SS'); +fetcherConfig: + fetchingThreadsNO: 3 + queryIncrementType: HOUR + queryIncrementValue: 12 + initialFetchValue: 2020-01-01 00:20:00.000 + useNativeDriver: true +``` diff --git a/dataphos-docs/content/publisher/usage.md b/dataphos-docs/content/publisher/usage.md new file mode 100644 index 0000000..80d71a7 --- /dev/null +++ b/dataphos-docs/content/publisher/usage.md @@ -0,0 +1,397 @@ +--- +title: "Usage" +draft: false +weight: 2 +--- + +This page contains all of the information required to further configure the Publisher and ensure proper communication between it and your external resources. + +# Kubernetes Environment + +### Enabling Connection to Different Message Brokers + +In order to enable Publisher to communicate with the service of your choice, you are required to create and deploy additional secrets in your Kubernetes environment, under the same namespace as Publisher. Google PubSub, for instance, requires the Service Account Key used to connect to your Cloud environment. Kafka, NATS and Pulsar require TLS secrets. 
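+ +As an illustration, a TLS secret such as the Kafka one from the tables below could be created from local certificate files like this (a sketch only; it assumes Publisher runs in the `dataphos` namespace and that the PEM files sit in the working directory; `kubectl` handles the base64 encoding): + +```bash +kubectl create secret generic kafka-tls-credentials -n dataphos \ + --from-file=ca_crt.pem=./ca_crt.pem \ + --from-file=client_crt.pem=./client_crt.pem \ + --from-file=client_key.pem=./client_key.pem +```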
+ +**PubSub** + +| Field Name | Secret Name | Description | +|--- |--- |--- | +| `key.json` | pubsub-key | Base64-encoded JSON service account key | + +**Kafka** +| Field Name | Secret Name | Description | +|--- |--- |--- | +| `ca_crt.pem` | kafka-tls-credentials | Base64 encoded Kafka cluster CA TLS certificate | +| `client_crt.pem` | kafka-tls-credentials | Base64 encoded Kafka user TLS certificate | +| `client_key.pem` | kafka-tls-credentials | Base64 encoded Kafka user TLS private key | + + +**NATS** +| Field Name | Secret Name | Description | +|--- |--- |--- | +| `ca_crt.pem` | nats-tls-credentials | Base64 encoded Nats cluster CA TLS certificate | +| `client_crt.pem` | nats-tls-credentials | Base64 encoded Nats user TLS certificate | +| `client_key.pem` | nats-tls-credentials | Base64 encoded Nats user TLS private key | + +**Pulsar** +| Field Name | Secret Name | Description | +|--- |--- |--- | +| `ca_crt.pem` | pulsar-tls-credentials | Base64 encoded Pulsar cluster CA TLS certificate | +| `client_crt.pem` | pulsar-tls-credentials | Base64 encoded Pulsar user TLS certificate | +| `client_key.pem` | pulsar-tls-credentials | Base64 encoded Pulsar user TLS private key | + +When deployed, the Publisher Worker will automatically look for these secrets at the time of running the individual jobs. + +# Publisher Configuration Files + +Once you are logged into the **WebUI**, you can add your source, destination and instance YAML files. + +* **Source** defines the source from which Publisher will fetch the data. +* **Destination** defines to where Publisher will deliver the data (the message broker). +* **Instance** connects the **Source** and **Destination**, and defines the actual **publishing job**. Both the source and destination may be used in multiple different publishing jobs. + +## Source + +The Source configuration YAML file is used to define the details regarding the source database that the user wants to use as a source of data. + +{{< tabs "sources" >}} +{{< tab "Shared Source Configuration" >}} + +**Shared Source Configuration** + +| Environment variable name | Value Type | Description | Required | +|---|---|---|---| +|username |String | The username that the user has to provide so that the connection to the desired database can be made. | Yes | +|sourceType |String | Source type represents the type of data source. Can be one of supported database types or API source.Supported database types: Postgres, Oracle, MySql, Db2, SqlServer, Api, Mongo. | Yes | +|sourceName |String | Name of the source configuration. This value needs to be unique upon a single Publisher deployment. Specifying the existing source name when creating a new source resource will result in failure. Also, this name cannot be empty, this is the unique identifier of an existing and active source configuration in the database. | Yes | +|port |Integer | Database port has to be provided so that a connection to the correct source database can be made. | Yes | +|password |String | Database password for the provided username. | Yes | +|parameters* |String | Represents any parameters that a specific database might use for performance purposes. These parameters can be defined in the form of a key: value map. | No | +|host |String | In case of a database source, the database hostname has to be provided so that a connection to the correct source database can be made. In case of an API source, the base API URL has to be provided. E.g., http://exposed-api:3030. 
The rest of the source API path will be appended based on the query field in the business object. | Yes | +|databaseName | String | Database name that contains source data. | Yes | + +{{< /tab >}} +{{< tab "Oracle" >}} + +**Source** + +| Environment variable name | Value Type | Description | Optimal Value | Required | +|---|---|---|---|---| +|prefetch_rows | Integer | Number of rows to prefetch for each fetch request that results in a roundtrip to the Oracle server. | 1000 | No | +|prefetch_memory | Integer | Sets the memory allocated for rows to be prefetched. | 100 | No | + +{{< /tab >}} +{{< /tabs >}} + +### Source YAML file example (Postgres) + + +```yaml +sourceName: publisher-postgres-source +sourceType: Postgres +host: +port: 5432 +databaseName: invoices +username: demo_user +password: demo_password +``` + +## Destination + +The Destination configuration YAML file is used to define the details regarding the destination service that the user wants to use as a final destination of data. Every destination includes parameters from the *Destination* tab. Each destination type has specific parameters that have to be added in the YAML file. These parameters are specified in the other tabs. + +{{< tabs "Specific destination" >}} + +{{< tab "Shared Destination Configuration" >}} + +**Destination** + +| Environment variable name | Value Type | Description | Required | +|---|---|---|---| +|destinationName |String | Name of the destination configuration. This value needs to be unique upon a single Publisher deployment. Specifying the existing destination name when creating a new destination resource will result in failure. Also, this name cannot be empty, this is the unique identifier of an existing and active destination configuration in the database. |Yes | +|destinationType |String |Destination type represents the type of destination service that destination configuration will be using. Supported destination types: PubSub, Solace, Kafka, Azure (ServiceBus from version 0.5.1), NatsCore, NatsJetStream. | Yes | +|parameters* |Key-value map | The values specific for some messaging platforms can be defined in the form of a key: value map. |Yes | + {{< /tab >}} + +{{< tab "Google PubSub" >}} + +**Google PubSub** + +| Environment variable name | Value Type | Description | Required | +|--- |--- |--- |--- | +| ProjectID | String | Name of the Google Cloud Platform project that contains the topic where data will be sent to. | Yes | +| TopicID | String | Name of the Google Cloud Platform topic which will receive sent data. | Yes | +| ByteThreshold | Integer | Suggested value: 150000000 | No | +| CountThreshold | Integer | Suggested value: 400 | No | +| DelayThreshold | Integer | Suggested value: 10000000 | No | +| NumOfGoroutines | Integer | Suggested value: 20 | No | +| MaxOutStandingMessages | Integer | Suggested value: 800 | No | +| MaxOutStandingBytes | Integer | Suggested value: 1000 * 1024 * 1024 | No | +| EnableMessageOrdering | Boolean | Suggested value: false | No | + +{{< /tab >}} + +{{< tab "Kafka" >}} + +**Kafka** + +| Environment variable name | Value Type | Description | Required | +|--- |--- |--- |--- | +| BrokerAddr | String | List of Kafka broker endpoints separated by commas, or a single broker. Example: 10.0.42.206:9092 | Yes | +| TopicID | String | Name of the Kafka topic which will receive sent data. | Yes | +| BatchSize | Integer | BatchSize sets the max amount of records the client will buffer, blocking new produces until records are finished if this limit is reached. 
| No | +| BatchBytes | Integer (Int64) | BatchBytes when multiple records are sent to the same partition, the producer will batch them together. BatchBytes parameter controls the amount of memory in bytes that will be used for each batch. | No | +| Linger | Integer | Linger controls the amount of time to wait for additional messages before sending the current batch. | No | +| TLS | String | Whether encryption should be enabled. Possible values: true or false. Default value is false. | No | +| VerifyServerCertificate | Boolean | If set to true and TLS is enabled, ca_cert.pem file defined in the kafka-tls-credentials secret is used to authenticate the Kafka broker. Default value is false. | No | +| VerifyClientCertificate | Boolean | If set to true and TLS is enabled, client_crt.pem and client_key.pem file defined in the kafka-tls-credentials secret are used to authenticate the client on the broker. Default value is false. | No | + +{{< /tab >}} + +{{< tab "Solace" >}} + +**Solace** + +| Environment variable name | Value Type | Description | Required | +|--- |--- |--- |--- | +| BrokerURI | String | URI to connect to the broker. | Yes | +| TopicID | String | Name of the topic which will receive sent data. | Yes | +| Username | String | The username for the client. | Yes | +| Password | String | The password for the client. | Yes | +| Qos | Integer | Level of quality of service. Default value is 1. | No | +{{< /tab >}} + +{{< tab "Azure" >}} + +**Azure** + +| Environment variable name | Value Type | Description | Required | +|--- |--- |--- |--- | +| ConnectionString | String | A connection string includes the authorization information required to access data in an Azure Storage account at runtime using Shared Key authorization | Yes | +| TopicID | String | Name of the topic which will receive sent data. | Yes | +{{< /tab >}} + +{{< tab "Pulsar" >}} + +**Pulsar** + +| Environment variable name | Value Type | Description | Required | +|--- |--- |--- |--- | +| ServiceURL | String | URL to connect to the brokers. | Yes | +| TopicID | String | Name of the topic which will receive sent data. | Yes | +| TLS | String | Whether encryption should be enabled. Possible values: true or false. Default value is false. | No | +| VerifyServerCertificate | Boolean | If set to true and TLS is enabled, ca_cert.pem file defined in the pulsar-tls-credentials secret is used to authenticate the Pulsar broker. Default value is false. | No | +| VerifyClientCertificate | Boolean | If set to true and TLS is enabled, client_crt.pem and client_key.pem file defined in the pulsar-tls-credentials secret are used to authenticate the client on the broker. Default value is false. | No | +| ConnectionTimeout | Integer | ConnectionTimeout is the timeout for the establishment of a TCP connection in seconds. Default value is 5 seconds. | No | +| OperationTimeout | Integer | OperationTimeout is the timeout for creating the publisher. Default value is 30 seconds. | No | +| SendTimeout | Integer | SendTimeout is the timeout for a published message to be acknowledged by the broker. Default value is 30 seconds. | No | +| MaxConnectionsPerBroker | Integer | MaxConnectionsPerBroker is the max number of connections to a single broker that will be kept in the pool. Default value is 1. | No | +| DisableBlockIfQueueFull | Integer | DisableBlockIfQueueFull controls whether publishing blocks if producer's message queue is full. Default value is false. 
| No | +| MaxPendingMessages | Integer | MaxPendingMessages specifies the max size of the queue holding messages waiting an acknowledgment from the broker. Default value is 1. | No | +| MaxReconnectToBroker | Integer | MaxReconnectToBroker specifies the maximum retry number of reconnectToBroker. Default value is nil. This means the client retries forever. | No | + +{{< /tab >}} + +{{< tab "NATS">}} + +**NATS** + + +| Environment variable name | Value Type | Description | Required | +|--- |--- |--- |--- | +| URL | String | URL to connect to the brokers. | Yes | +| Subject | String | Name of the subject which will receive sent data. | Yes | +| MaxPending | Integer | MaxPending sets the maximum outstanding async publishes that can be inflight at one time. Default value is 512. | No | +| TLS | String | Whether encryption should be enabled. Possible values: true or false. Default value is false. | No | +| VerifyServerCertificate | Boolean | If set to true and TLS is enabled, ca_cert.pem file defined in the kafka-tls-credentials secret is used to authenticate the Kafka broker. Default value is false. | No | +| VerifyClientCertificate | Boolean | If set to true and TLS is enabled, client_crt.pem and client_key.pem file defined in the kafka-tls-credentials secret are used to authenticate the client on the broker. Default value is false. | No | + +{{< /tab >}} + +{{< /tabs >}} + +### Destination YAML file example (Google Pub/Sub) + + +```yaml +destinationName: publisher-pubsub-destination +destinationType: PubSub +parameters: + ProjectID: + TopicID: +``` + +## Publisher Instance + +The Instance configuration YAML file is used to define the job specifics. This is the central type of configuration and its creation and validation depend on the existing active **source** and **destination** configurations. The Instance configuration YAML file is fairly minimalistic in approach, but allows enough flexibility to fine-tune the requirements of your publishing job. + +Here, we define the query the data will be fetched with, how it will be formatted, serialized, encrypted and, finally, published. + +For ease of understanding, we will split the file into three distinct sections: + +* The **Instance** portion, defining how the source and destination should connect. +* The **Fetcher** portion, defining how the data should be pulled from the source. +* The **Business object** portion, defining how the data will be transformed directly prior to publishing. + + +{{< tabs "configfilesother" >}} +{{< tab "Instance configuration" >}} + +Instance configuration + +| Environment variable name | Value Type | Description | Required | +|---|---|---|---| +|publisherName |String |Name of the Publisher instance. This value needs to be unique upon a single Publisher deployment. Specifying the existing publisher name when creating a new Publisher instance will result in failure. Also, this value cannot be empty. | Yes | +|sourceName |String |Name of the existing source configuration that this Publisher instance will use for setting up the connection to source. Specifying a nonexistent source name will result in failure in the validation process. |Yes | +|destinationName |String | Name of the existing destination configuration that this Publisher instance will use for setting up destination details. Specifying a nonexistent destination name will result in failure in the validation process. | Yes | +|serializationType | String |Name of the supported serialization type to be used in Publisher instance. 
Only one serialization type can be defined per instance. Supported serializations: Avro, Json |Yes | +|encryptionEnabled |Boolean | A boolean flag indicating whether encryption will be used in the Publisher instance. |No | +|encryptionType |String |If encryptionEnabled is set to true, the user is expected to specify the encryption type that will be used in the Publisher instance. Only one type can be specified. Supported encryptions: Aes256 |No (Yes if encryptionEnabled is true) | +|encryptionKeyName |String |The name of the variable that represents the user's encryption key; the user provides only the key name, never the key itself. |No (Yes if encryptionEnabled is true) | +|scheduleInterval |String |Controls how often Publisher performs a single run (fetching and publishing of data) when running in scheduled mode. This needs to be a cron-type expression (e.g. */1 * * * * for every minute of every hour); between runs, Publisher waits. Default value: null. | Yes (only if fetchSkippedScheduledIntervals is true) | +|schema |String | The schema used for serialization. The user can provide one; if none is provided, Publisher will generate a schema, use it and store it for future runs. |No | +|scheduledStartTime | String | Schedules when Publisher will start its initial run. The expected input is the date of the desired start (e.g. 2022-01-01 00:00:00); until then, Publisher sleeps. By default this value is null, meaning Publisher starts immediately upon successful creation of the configuration. | No | +|fetcherConfig |Key-value map |Map of fetcher settings for this instance (see the Fetcher configuration tab). |Yes | +|useApiWorker |Boolean |Set to true when the instance fetches from an API source. |No | +|useApiJwtAuth |Boolean |Set to true when the API source requires JWT authentication to access data. |No | +{{< /tab >}} + + +{{< tab "Fetcher configuration" >}} + +Fetcher configuration + + +| Environment variable name | Value Type | Description | Required | +|---|---|---|---| +| useNativeDriver |Boolean |Indicates whether the native Golang drivers (faster, but can behave inconsistently) or the Java fetcher (more stable, but slower) should be used for fetching. |No | +| UseReflectTypeFetch |Boolean |Used when the native Golang drivers are used for fetching. If true, fetched data types will be mapped to the ones in the database, which makes fetching slower. | No | +| ReturnCsv |Boolean |If true, CSV format will be returned instead of JSON from the Java Fetcher. |No | +| initialFetchValue | String | Defines the lower bound of the first, initial fetch period in the WHERE condition clause. For the very first run, Publisher applies a greater-or-equal ( >= ) condition on it. |No | +| endFetch | String |Since queries in Publisher are executed on specific time intervals, this represents the date which, when reached, stops the fetching process and therefore the Publisher itself. By default it is empty, but the user can provide a date (e.g. 2010-01-01 00:00:00). | No | +|fetchingThreadsNO |Integer | The number of parallel threads that split the query fetch period in order to speed up row fetching.
|No | +|apiFetchFromParam | String |The query parameter in the API URL that represents the date from which data is fetched. E.g., with date_from as apiFetchFromParam: http://exposed-api:3001?date_from=&... |Yes (only if useApiWorker is set to true) | +|apiFetchToParam |String |The query parameter in the API URL that represents the date until which data is fetched. E.g., with date_to as apiFetchToParam: http://exposed-api:3001?...&date_to= |Yes (only if useApiWorker is set to true) | +|apiTimeLayout |String | The format of the timestamp value that the Publisher appends to apiFetchFromParam and apiFetchToParam. E.g., 2006-01-02 15:04:05. |Yes (only if useApiWorker is set to true) | +|apiQueryParams |String |Map of key-value pairs containing static query parameters for the API URL. | No | +|apiHeaderParams |String |Map of key-value pairs containing header information for the API request to fetch data. |No | +|apiJwtAuthPath |String |The path which will be appended to the host field in the source configuration. This URL will be used to generate the JWT. |Yes (only if useApiJwtAuth is set to true) | +|apiJwtAuthBody |String |Map of key-value pairs containing the body for the POST request to generate the JWT. |Yes (only if useApiJwtAuth is set to true) | +|apiJwtAuthHeaders |String |Map of key-value pairs containing header information for the API request to generate the JWT. |No | +|queryIncrementType queryIncrementValue |String Integer | Together, these two parameters determine the upper bound of the fetch period. Publisher automatically applies a strictly-lower condition ( < ) and calculates the upper bound as: last successfully fetched period + queryIncrementValue queryIncrementType. For example: 2010-03-01 00:00:00 + 1 year => < 2011-03-01 00:00:00; 2010-03-01 00:00:00 + 3 month => < 2010-06-01 00:00:00; 2010-03-01 00:00:00 + 5 minute => < 2010-03-01 00:05:00. Supported types for queryIncrementType: year, month, day, hour, minute, second | Yes | +|dataSeparatorChar |String | Taken into account when the Publisher uses the Java Fetcher. Indicates the separator character used when separating fetched data. |No | +|fetchSkippedScheduledIntervals |Boolean | Used when the user wants the Publisher instance to catch up to the current time if it was stopped for some reason while running in scheduled mode. | No | +{{< /tab >}} + + +{{< tab "Business object" >}} + +Business object + + +| Environment variable name | Value Type | Description | Required | +|---|---|---|---| +|description |String |Used for describing and clarifying the business object. |No | +|objectGroup |String |Can be filled out if this business object belongs to a certain predefined group of business objects. Used for easier searching. |No | +|additionalMetadata |Key-value map |Any additional metadata that the user wants to provide with the business object, in the form of a key: value map. |No | +|batchMode |String | The `batchMode` parameter controls how rows fetched from the database are grouped into batches. You can pass one of two values: `MaxRowCount` or `MaxEstimatedSizeInBytes`. **MaxRowCount**: Batches are filled up to a specified number of rows (`batchSize`). A batch may contain fewer rows if there is insufficient data remaining in the current run. **MaxEstimatedSizeInBytes**: Batches are filled up to a specified memory size in bytes (`batchSize`). A batch may contain fewer bytes if there is insufficient data remaining.
If `groupElements` is defined, the batch of rows is grouped and formatted into a single message according to the rules specified in the `definition`. Otherwise, the batch of rows is returned as-is. |No | +|batchSize |Integer |The `batchSize` parameter configures the size of each batch, depending on the selected `batchMode`: **MaxRowCount**: `batchSize` specifies the number of rows in each batch. **MaxEstimatedSizeInBytes**: `batchSize` specifies the memory size of each batch in bytes. |Yes (only if batchMode is specified) | +|definition |String | Defines the message format. Used for detailing the business object structure and fields. Needs to be defined when groupElements is set (grouping is enabled). Database rows that are grouped by the groupElements values into a single business object are formatted into a single message according to the definition. Important note: to prevent data loss, every column in the definition that is not the same for each grouped database row in the business object should be placed in an array of values (arrayElements). |No | +|arrayElements |String | Definition elements that will be created and treated as arrays. These elements need to contain the non-repeating columns, so that each record sharing the same grouping keys has its non-repeating values stored as a single element of the array node. |No | +|groupElements |String |List of columns whose values need to be the same for multiple fetched dataset rows in order for them to be grouped into a single message structure that fits the specified definition (business object). If groupElements is not defined (grouping is disabled), the fetched database rows are returned as messages without additional formatting and the definition is not used. |No | +|keyElements |String |List of columns whose values are concatenated with an underscore delimiter into a single value; together with idElements, this value can uniquely mark every record sent to the messaging platform. The value extracted using `keyElements` is set as the key for the message sent to the broker. When `groupElements` is not defined (grouping is disabled) and batchMode is specified (batching is enabled), the key elements are extracted for the first and last row in the batch and concatenated with a semicolon delimiter. In that case, the recommended value for `keyElements` is the timestamp column used in the query (the rows are sorted by that timestamp). |No | +|idElements |String | List of columns whose values are concatenated with an underscore delimiter into a single value; together with keyElements, this value can uniquely mark every record sent to the messaging platform. When groupElements is not defined (grouping is disabled) and batchMode is specified (batching is enabled), the id elements are extracted for the first and last row in the batch and concatenated with a semicolon delimiter. |No | +|query |String |Users must enter a valid SQL SELECT query (WITH CTE is also supported) used to fetch database rows. The query must select all columns that will be used in the business object definition. The query must have placeholder variables for a timestamp column used to fetch data in uniform timestamp intervals per Publisher run. Publisher will automatically replace the variables with actual timestamps calculated based on configuration.
E.g., SELECT invoice_id, cost FROM invoices WHERE creation_date >= to_timestamp({{ .FetchFrom }}, 'YYYY-MM-DD HH24:MI:SS') AND creation_date < to_timestamp({{ .FetchTo }}, 'YYYY-MM-DD HH24:MI:SS'); |Yes | + +{{< /tab >}} +{{< /tabs >}} + +### Instance YAML file example (Postgres to Google Pub/Sub) + + +```yaml +publisherName: publisher-demo +sourceName: publisher-postgres-source +destinationName: publisher-pubsub-destination +serializationType: Avro +encryptionEnabled: false +businessObject: + description: "Demo Publisher - invoices for client" + objectGroup: "invoices-client" + additionalMetadata: + organization: Syntio + definition: + - client_info: + - client_id + - client_username + - client_location + - invoice_info: + - invoice_id + - creation_date + - due_date + - fully_paid_date + - invoice_items: + - invoice_item_id + - quantity + - total_cost + groupElements: + - client_id + - invoice_id + arrayElements: + - invoice_items + keyElements: + - client_id + - invoice_id + idElements: + - client_id + query: | + SELECT + invoice_id, + client_id, + client_username, + client_location, + creation_date, + due_date, + fully_paid_date, + invoice_item_id, + quantity, + total_cost, + billing_item_id + FROM demo_invoices + WHERE + creation_date >= to_timestamp({{ .FetchFrom }},'YYYY-MM-DD HH24:MI:SS') + AND + creation_date < to_timestamp({{ .FetchTo }}, 'YYYY-MM-DD HH24:MI:SS'); +fetcherConfig: + fetchingThreadsNO: 3 + queryIncrementType: HOUR + queryIncrementValue: 12 + initialFetchValue: 2020-01-01 00:20:00.000 + useNativeDriver: true +``` + +## Web UI usage + +Below is the screen you will be greeted with after first logging into the Publisher Web UI. + +![](/home.PNG) + +**Publisher runs** - statistical information about the execution of each instance. + +![](/runs_n.PNG) + +**Publisher Instances** - overview of Publisher’s instance configurations, providing the ability to start and stop instances as required. + +![](/instance_n.PNG) + +**Publisher Lineage** - data origin, what happens to it, and where it moves over time. + +![](/lineage.png) + +**Publisher queries** - testing Publisher instance configuration with verbose error messages. You can view the efficiency of query as well as a sample of the data obtained. + +![](/queries.PNG) + +**WEB CLI** - update and process YAML files to update sources, destinations, configuration. First, add source YAML, then destination and then instance. You can add, update and delete your configuration. 
+ +![](/webcli.PNG) \ No newline at end of file diff --git a/dataphos-docs/content/publisher/videos-and-blogs.md b/dataphos-docs/content/publisher/videos-and-blogs.md new file mode 100644 index 0000000..e5e8573 --- /dev/null +++ b/dataphos-docs/content/publisher/videos-and-blogs.md @@ -0,0 +1,19 @@ +--- +title: "Videos and Blogs" +draft: false +weight: 5 +--- +## Blogs +[Publisher business blog](https://www.syntio.net/en/labs-musings/publisher-the-accelerator-to-your-decision-making-process?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) + +[Publisher technical blog](https://www.syntio.net/en/labs-musings/publisher-a-technical-overview-of-our-cutting-edge-solution?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) + +## Videos + +Publisher Overview and Demo + +[![Publisher - A Data Platform component by Syntio - Showcase](/publisher_thumbnail_overview.jpg)](https://youtu.be/zTEXAnJhBkw "Publisher - A Data Platform component by Syntio - Showcase") + +Publisher Deployment Guide + +[![Publisher - A Data Platform component by Syntio - Deployment Guide](/publisher_thumbnail_deployment.jpg)](https://youtu.be/tyz2zwUbIbE "Publisher - A Data Platform component by Syntio - Deployment Guide") diff --git a/dataphos-docs/content/publisher/what-is-the-publisher.md b/dataphos-docs/content/publisher/what-is-the-publisher.md new file mode 100644 index 0000000..b02c9d6 --- /dev/null +++ b/dataphos-docs/content/publisher/what-is-the-publisher.md @@ -0,0 +1,129 @@ +--- +title: "Overview" +draft: false +weight: 1 +--- + +![](/publisher.png) + +**Publisher** is a component developed for running a constant flow of ready-to-digest data packets across your Cloud infrastructure -- sourced directly from a database or from an exposed API. Whereas most Change Data Capture (CDC) solutions focus on capturing and transmitting changes on individual tables, **Publisher** focuses on *forming* **well-defined, structured objects** directly at the source, *publishing* them to a **distributed message broker** of your choice and allowing *a multitude of consumers* to **independently consume and process the data in their own individual pipelines**. + +Collectively, this is known as a **create-transform-publish** pattern. The structured objects generated by the Publisher are what we refer to as **business objects** -- you are no longer pushing technical data on changes in your database, but curated, key business information. + +As a user, you define the query and **how** the result of that query should be assembled into a business object with a minimalist and easy to understand YAML configuration. **Publisher** takes care of the rest -- assembling the data, serializing it, encrypting it and -- finally -- publishing it. +You start with a query: + +```plaintext +SELECT + t.transaction_id, + t.transaction_time, + t.amount, + t.user_id, + u.user_name, + u.user_country, +FROM + transactions t + JOIN user u ON t.user_id = u.user_id +WHERE + -- Publisher syntax for defining the time increments in which data should be captured. + -- Publisher will automatically replace the rules here with actual timestamps, + -- calculated based on configuration. + transaction_time >= to_timestamp({{ .FetchFrom }}, 'YYYY-MM-DD HH24:MI:SS') + AND + transaction_time < to_timestamp({{ .FetchTo }}, 'YYYY-MM-DD HH24:MI:SS'); +``` +Publisher will automatically replace the placeholder variables with actual timestamps calculated based on configuration. 
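+
+The size of each increment comes from the instance's fetcher settings, documented on the [Usage](/publisher/usage) page. A minimal sketch, assuming an hourly increment and reusing the `fetcherConfig` keys from that page:
+
+```yaml
+# Sketch only: the keys below are the fetcherConfig fields described on the Usage page.
+fetcherConfig:
+  initialFetchValue: 2020-01-01 00:00:00.000  # first value substituted for {{ .FetchFrom }}
+  queryIncrementType: HOUR                    # unit used to advance the fetch window
+  queryIncrementValue: 1                      # {{ .FetchTo }} = {{ .FetchFrom }} + 1 HOUR
+```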
+ +Our goal is to publish information on all transactions, aggregated by **individual users**. + +We would therefore format the results of this query as: + +```yaml +definition: # Define general structure. + - user: + - user_id + - user_name + - user_country + - transactions: + - transaction_id + - transaction_time + - amount +groupElements: # Define what defines a unique group. + - user_id +arrayElements: # Specify what is repeatable object. + - transactions +idElements: # Information to use on the unique identifier the message will be sent with. + - user_id +``` + +Serialized into JSON, this would result in the following data being published to your streaming platform: + +```json +[ + { + "user": { + "user_id": "some_user", + "user_name": "Some Name", + "user_country": "Croatia" + }, + "transactions": [ + { + "transaction_id": "...", + "transaction_time": "...", + "amount": 1.5 + }, + { + "transaction_id": "...", + "transaction_time": "...", + "amount": 5.3 + }, + // ... + ] + }, + { + "user": { + "user_id": "some_other_user", + "user_name": "Some Other Name", + "user_country": "France" + }, + "transactions": [ + { + "transaction_id": "...", + "transaction_time": "...", + "amount": 5 + }, + // ... + ] + }, +] +``` + +(You as a user may determine if you wish to have multiple user records in a message or have each message be its own singular record.) + +## Publisher Components + +Publisher comes as a set of microservices. A single Publisher deployment is designed to enable multiple independent publishing jobs, configured and managed via a simple user interface. + +### Manager + +The Manager component is a REST server that exposes API endpoints used for configuration management. All configuration information (the source you will be pulling data from, the destination you will be publishing data to and how the data should be fetched and formatted) is stored in a small metadata database which is managed by the Publisher. + +### Scheduler + +Once you define the configuration of a Publishing job, the Scheduler component initializes a **Worker** responsible for running the job itself. A **Kubernetes Pod** is created for each active Publisher job configuration. The Scheduler destroys the pods once the job is completed or an error state is reached. This is what makes Publisher a true Kubernetes-native solution: dynamic and scalable. + +The Publisher can either publish data **constantly** or publish increments of data on a given cron-based schedule (for more information on this, please view the [Usage](/publisher/usage) page). + +During its periodical checking of Kubernetes pods status, the Scheduler component ensures that all the pods that should be active are, in fact active. If a pod breaks and crashes for some reason, the Scheduler will create it again if it is supposed to be running. + +### Worker + +One Worker component is created for each active Publisher configuration. There can be multiple active workers with different configurations simultaneously. + +Once a Worker is created, it processes data in a loop until a previously defined stopping point, until it’s stopped by the user, or until an error has occurred. + +### Web UI + +The Web UI component is a visual tool used to apply the configuration files of the individual Publisher jobs and monitor the performance of Publisher instances. 
+ +![](/home.PNG) diff --git a/dataphos-docs/content/referenced scripts/YAML examples/_index.md b/dataphos-docs/content/referenced scripts/YAML examples/_index.md new file mode 100644 index 0000000..035ea08 --- /dev/null +++ b/dataphos-docs/content/referenced scripts/YAML examples/_index.md @@ -0,0 +1,7 @@ +--- +title: "YAML Examples" +draft: false +weight: 3 +geekdocCollapseSection: true +--- +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/referenced scripts/YAML examples/persistor.md b/dataphos-docs/content/referenced scripts/YAML examples/persistor.md new file mode 100644 index 0000000..fb3ac79 --- /dev/null +++ b/dataphos-docs/content/referenced scripts/YAML examples/persistor.md @@ -0,0 +1,1284 @@ +--- +title: "Persistor Examples" +draft: false +--- + +# Dataphos Persistor + +## Persistor GCP +{{< details "YAML example" >}} +``` +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "pubsub" + SENDER_TYPE: "pubsub" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_PUBSUB_PROJECTID: "" # change this + READER_PUBSUB_SUBID: "" # change this + STORAGE_TYPE: "gcs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_PUBSUB_PROJECTID: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: gcp-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "pubsub" + SENDER_TYPE: "pubsub" + DEADLETTERENABLED: "true" + + READER_PUBSUB_PROJECTID: "" # change this + READER_PUBSUB_SUBID: "" # change this + + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + + 
SENDER_DEADLETTERTOPIC: "" # change this + SENDER_PUBSUB_PROJECTID: "" # change this + + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "WARN" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + PUBSUB_PROJECT_ID: "" # change this + INDEXER_URL: "http://indexer-api-svc:8080" + MINIMUM_LOG_LEVEL: "WARN" + SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" + STORAGE_TYPE: "gcs" # Do not change! + PUBLISHER_TYPE: "pubsub" # Do not change! 
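+  # Pub/Sub publish batching and flow-control settings used when resubmitting messages.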
+ PUBLISH_TIMEOUT: "15s" + PUBLISH_COUNT_THRESHOLD: "50" + PUBLISH_DELAY_THRESHOLD: "50ms" + NUM_PUBLISH_GOROUTINES: "10" + MAX_PUBLISH_OUTSTANDING_MESSAGES: "800" + MAX_PUBLISH_OUTSTANDING_BYTES: "1048576000" + PUBLISH_ENABLE_MESSAGE_ORDERING: "false" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: rsb-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +``` + +{{< /details >}} + +## Persistor Azure +{{< details "YAML example" >}} + +``` +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "servicebus" + SENDER_TYPE: "servicebus" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_SERVICEBUS_CONNECTIONSTRING: "" # change this + READER_SERVICEBUS_TOPICID: "" # change this - must be equal to STORAGE_TOPICID + READER_SERVICEBUS_SUBID: "" # change this + STORAGE_TYPE: "abs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + STORAGE_STORAGEACCOUNTID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_SERVICEBUS_CONNECTIONSTRING: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + containers: + - name: azure-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: 
"servicebus" + SENDER_TYPE: "servicebus" + DEADLETTERENABLED: "true" + READER_SERVICEBUS_CONNECTIONSTRING: "" # change this + READER_SERVICEBUS_TOPICID: "" # change this + READER_SERVICEBUS_SUBID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_SERVICEBUS_CONNECTIONSTRING: "" # change this + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "2s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + INDEXER_URL: http://indexer-api-svc:8080 + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + SB_CONNECTION_STRING: "" # change this + AZURE_STORAGE_ACCOUNT_NAME: "" # change this + MINIMUM_LOG_LEVEL: "INFO" + STORAGE_TYPE: "abs" # Do not change! + PUBLISHER_TYPE: "servicebus" # Do not change! 
+ SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "2s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + envFrom: + - configMapRef: + name: rsb-config + +``` + +## Persistor Kafka into Azure Blob Storage +``` +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this - must be equal to STORAGE_TOPICID + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + STORAGE_TYPE: "abs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + STORAGE_STORAGEACCOUNTID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + containers: + - name: azure-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + 
SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "2s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + MINIMUM_LOG_LEVEL: "INFO" + INDEXER_URL: http://indexer-api-svc:8080 + STORAGE_TYPE: "abs" # Do not change! + PUBLISHER_TYPE: "kafka" # Do not change! 
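+  # Resubmitter server, worker, Azure and Kafka settings follow.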
+ SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + AZURE_STORAGE_ACCOUNT_NAME: "" # change this + KAFKA_BROKERS: "" # change this + KAFKA_USE_TLS: "false" + KAFKA_USE_SASL: "false" + SASL_USERNAME: "default" + SASL_PASSWORD: "default" + KAFKA_SKIP_VERIFY: "false" + KAFKA_DISABLE_COMPRESSION: "false" + KAFKA_BATCH_SIZE: "50" + KAFKA_BATCH_BYTES: "52428800" + KAFKA_BATCH_TIMEOUT: "10ms" + ENABLE_KERBEROS: "false" + KRB_CONFIG_PATH: "/path/to/config/file" + KRB_REALM: "REALM.com" + KRB_SERVICE_NAME: "kerberos-service" + KRB_KEY_TAB: "/path/to/file.keytab" + KRB_USERNAME: "user" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + envFrom: + - configMapRef: + name: rsb-config + + +``` +{{< /details >}} + +## Persistor Kafka into GCS +{{< details "YAML example" >}} + +``` +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this - must be equal to STORAGE_TOPICID + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + STORAGE_TYPE: "gcs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: gcp-persistor + image: 
syntioinc/dataphos-persistor-core:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + imagePullSecrets: + - name: nexuscred +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + PUBSUB_PROJECT_ID: "" # change this + INDEXER_URL: "http://indexer-api-svc:8080" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" + STORAGE_TYPE: "gcs" # Do not change! + PUBLISHER_TYPE: "kafka" # Do not change! 
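+  # Kafka producer settings (and optional Kerberos auth) used when resubmitting messages.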
+ KAFKA_BROKERS: "" # change this + KAFKA_USE_TLS: "false" + KAFKA_USE_SASL: "false" + SASL_USERNAME: "default" + SASL_PASSWORD: "default" + KAFKA_SKIP_VERIFY: "false" + KAFKA_DISABLE_COMPRESSION: "false" + KAFKA_BATCH_SIZE: "50" + KAFKA_BATCH_BYTES: "52428800" + KAFKA_BATCH_TIMEOUT: "10ms" + ENABLE_KERBEROS: "false" + KRB_CONFIG_PATH: "/path/to/config/file" + KRB_REALM: "REALM.com" + KRB_SERVICE_NAME: "kerberos-service" + KRB_KEY_TAB: "/path/to/file.keytab" + KRB_USERNAME: "user" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: rsb-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +``` +{{< /details >}} diff --git a/dataphos-docs/content/referenced scripts/YAML examples/publisher.md b/dataphos-docs/content/referenced scripts/YAML examples/publisher.md new file mode 100644 index 0000000..4296f5c --- /dev/null +++ b/dataphos-docs/content/referenced scripts/YAML examples/publisher.md @@ -0,0 +1,1265 @@ +--- +title: "Publisher Examples" +draft: true +weight: 3 +--- + +# Dataphos Publisher + +## Publisher Ingress +{{< details "YAML example" >}} + +``` +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: publisher-webui-ingress + namespace: dataphos + annotations: + kubernetes.io/ingress.class : nginx + nginx.ingress.kubernetes.io/ssl-redirect : "true" + nginx.ingress.kubernetes.io/enable-cors : "true" + nginx.ingress.kubernetes.io/cors-allow-methods : "PUT, GET, POST, DELETE, OPTIONS" + nginx.ingress.kubernetes.io/cors-allow-origin : "*" + nginx.ingress.kubernetes.io/azure-load-balancer-health-probe-request-path: /healthz +spec: + rules: + - host: # insert your WEB UI domain name, same as in the Manager config map + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: publisher-webui + port: + number: 8080 + tls: + - hosts: + - # insert your WEB UI domain name + secretName: webui-tls-secret +``` +{{< /details >}} + +## Publisher PostgreSQL Deployment +{{< details "YAML example" >}} + +``` +apiVersion: v1 +kind: Namespace +metadata: + name: publisher-source +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: publisher-postgres-source-config + namespace: publisher-source +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-source-secret + namespace: publisher-source +type: Opaque +stringData: + POSTGRES_DB: invoices + POSTGRES_USER: demo_user + POSTGRES_PASSWORD: demo_password +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres-source + namespace: publisher-source +spec: + selector: + app: publisher-postgres-source-db + ports: + - port: 5432 + type: LoadBalancer +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-source-db + namespace: publisher-source +spec: + serviceName: publisher-postgres-source + replicas: 1 + 
selector: + matchLabels: + app: publisher-postgres-source-db + template: + metadata: + labels: + app: publisher-postgres-source-db + spec: + containers: + - name: publisher-postgres-source + image: syntioinc/dataphos-publisher-source-example:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-source-config + - secretRef: + name: publisher-postgres-source-secret + volumeMounts: + - name: publisher-postgres-source-data-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-source-data-volume + namespace: publisher-source + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 500M +``` +{{< /details >}} + +## Publisher k8s +{{< details "YAML example" >}} + +``` +# Postgres metadata database +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-postgres-config + namespace: dataphos +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-secret + namespace: dataphos +type: Opaque +stringData: + POSTGRES_DB: publisher # insert your database name, same as METADATA_DATABASE in configuration.yaml + POSTGRES_USER: publisher # insert your database username, same as METADATA_USERNAME in configuration.yaml + POSTGRES_PASSWORD: samplePassworD1212 # insert your database user password, same as METADATA_PASSWORD in configuration.yaml +--- + + +# Common configuration +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-metadata-config + namespace: dataphos +data: + METADATA_HOST: publisher-postgres.dataphos.svc + METADATA_PORT: "5432" + METADATA_DATABASE: publisher_metadata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-metadata-secret + namespace: dataphos +type: Opaque +stringData: + METADATA_USERNAME: publisher # insert your database username + METADATA_PASSWORD: samplePassworD1212 # insert your database user password +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: kafka-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Kafka cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Kafka user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Kafka user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: nats-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pulsar-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pubsub-key + namespace: dataphos +type: Opaque 
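+# Referenced by SECRET_NAME_PUBSUB in the publisher-scheduler-config ConfigMap below.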
+data: + "key.json": "" # insert your base64 encoded Pub/Sub service account key, leave empty if publishing to Pub/Sub + # not needed (optional) +--- + +apiVersion: v1 +kind: Secret +metadata: + name: encryption-keys + namespace: dataphos +type: Opaque +stringData: # insert your encryption keys, one or more + "keys.yaml": | + ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A" +--- +# Manager +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-manager-config + namespace: dataphos +data: + WEB_UI: # insert your webui domain name + FETCHER_URL: http://publisher-data-fetcher:8081 +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-manager-secret + namespace: dataphos +type: Opaque +stringData: + JWT_SECRET: SuperSecretPass! # insert your JWT secret key, 16 characters +--- + +# Data Fetcher +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-data-fetcher-config + namespace: dataphos +data: + MANAGER_URL: http://publisher-manager:8080 +--- + +# Scheduler +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-scheduler-config + namespace: dataphos +data: + WORKER_IMAGE: syntioinc/dataphos-publisher-worker:1.0.0 + FETCHER_URL: http://publisher-data-fetcher:8081 + SCHEMA_GENERATOR_URL: http://publisher-avro-schema-generator:8080 + SCHEMA_VALIDATION_URL: http:/ # insert the schema registry public URL or 0.0.0.0 if schema registry is not deployed + IMAGE_PULL_SECRET: regcred + KUBERNETES_NAMESPACE: dataphos + SECRET_NAME_PUBSUB: pubsub-key + SECRET_NAME_KAFKA: kafka-tls-credentials + SECRET_NAME_NATS: nats-tls-credentials + SECRET_NAME_PULSAR: pulsar-tls-credentials +--- + +# WebUI +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-webui-config + namespace: dataphos +data: + "server.properties": | + window.MANAGER_ENDPOINT = "/backend" +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres + namespace: dataphos +spec: + selector: + app: publisher-postgres-db + ports: + - port: 5432 +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-db + namespace: dataphos +spec: + serviceName: publisher-postgres + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-db + template: + metadata: + labels: + app: publisher-postgres-db + spec: + containers: + - name: publisher-postgres + image: postgres:latest + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-config + - secretRef: + name: publisher-postgres-secret + volumeMounts: + - name: publisher-postgres-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-volume + namespace: publisher + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi +--- + +# Initialize metadata database +apiVersion: batch/v1 +kind: Job +metadata: + name: publisher-initdb + namespace: dataphos +spec: + template: + spec: + containers: + - name: initdb + image: syntioinc/dataphos-publisher-initdb:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret + restartPolicy: OnFailure + backoffLimit: 15 +--- + + +# Avro Schema Generator +apiVersion: v1 +kind: Service +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + selector: + app: server + component: avro-schema-generator + ports: + - protocol: TCP + port: 8080 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + 
replicas: 1 + selector: + matchLabels: + app: server + component: avro-schema-generator + template: + metadata: + labels: + app: server + component: avro-schema-generator + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: avro-schema-generator + image: syntioinc/dataphos-publisher-avro-schema-generator:1.0.0 + resources: + limits: + cpu: 500m + requests: + cpu: 50m + memory: 250Mi +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-manager + namespace: dataphos +spec: + selector: + app: server + component: manager + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-manager + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: manager + template: + metadata: + labels: + app: server + component: manager + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-manager:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 45Mi + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: publisher-manager-config + - secretRef: + name: publisher-manager-secret + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + selector: + app: server + component: data-fetcher + ports: + - protocol: TCP + port: 8081 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: data-fetcher + template: + metadata: + labels: + app: server + component: data-fetcher + annotations: + syntio.net/logme: "true" + spec: + initContainers: + - name: check-manager-health + image: curlimages/curl:7.85.0 + command: ['sh', '-c', 'while [ `curl -s -o /dev/null -w "%{http_code}" http://publisher-manager:8080` -ne 200 ]; do echo waiting for manager to be ready...; sleep 10; done;'] + containers: + - name: data-fetcher + image: syntioinc/dataphos-publisher-data-fetcher:1.0.0 + resources: + limits: + cpu: 600m + requests: + cpu: 200m + memory: 160Mi + ports: + - containerPort: 8081 + envFrom: + - configMapRef: + name: publisher-data-fetcher-config +--- + + +# Kubernetes Service Account +apiVersion: v1 +kind: ServiceAccount +metadata: + name: publisher-sa + namespace: dataphos +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: publisher-sa-role + namespace: dataphos +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: publisher-sa-rb + namespace: dataphos +subjects: + - kind: ServiceAccount + name: publisher-sa +roleRef: + kind: Role + name: publisher-sa-role + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-scheduler + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: scheduler + template: + metadata: + labels: + app: server + component: scheduler + annotations: + syntio.net/logme: "true" + spec: + serviceAccountName: publisher-sa + containers: + - name: scheduler + image: syntioinc/dataphos-publisher-scheduler:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + 
memory: 30Mi + envFrom: + - configMapRef: + name: publisher-scheduler-config + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-webui + namespace: dataphos +spec: + selector: + app: webui + component: webui + ports: + - port: 8080 + type: NodePort +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-webui + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: webui + component: webui + template: + metadata: + labels: + app: webui + component: webui + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-webui:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + ports: + - containerPort: 8080 + volumeMounts: + - name: publisher-webui-config-volume + mountPath: /usr/share/nginx/html/config.js + subPath: config.js + volumes: + - name: publisher-webui-config-volume + configMap: + name: publisher-webui-config + items: + - key: server.properties + path: config.js +``` +{{< /details >}} + + +## Publisher GCP +{{< details "YAML example" >}} + +``` +# Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: dataphos +--- + +# Postgres metadata database +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-postgres-config + namespace: dataphos +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-secret + namespace: dataphos +type: Opaque +stringData: + POSTGRES_DB: dataphos_publisher # insert your database name, same as METADATA_DATABASE in configuration.yaml + POSTGRES_USER: publisher # insert your database username, same as METADATA_USERNAME in configuration.yaml + POSTGRES_PASSWORD: samplePassworD1212 # insert your database user password, same as METADATA_PASSWORD in configuration.yaml +--- + +# Common configuration +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-metadata-config + namespace: dataphos +data: + METADATA_HOST: publisher-postgres.dataphos.svc + METADATA_PORT: "5432" + METADATA_DATABASE: publisher_metadata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-metadata-secret + namespace: dataphos +type: Opaque +stringData: + METADATA_USERNAME: publisher # insert your database username + METADATA_PASSWORD: samplePassworD1212 # insert your database user password +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pubsub-key + namespace: dataphos +type: Opaque +data: + "key.json": "" # insert your base64 encoded Pub/Sub service account key, leave empty if publishing to Pub/Sub + # not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: kafka-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Kafka cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Kafka user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Kafka user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: nats-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) 
+ "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pulsar-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +apiVersion: v1 +kind: Secret +metadata: + name: encryption-keys + namespace: dataphos +type: Opaque +stringData: # insert your encryption keys, one or more + "keys.yaml": | + ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A" +--- + +# Manager +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-manager-config + namespace: dataphos +data: + WEB_UI: https:// # insert your webui domain name + FETCHER_URL: http://publisher-data-fetcher:8081 +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-manager-secret + namespace: dataphos +type: Opaque +stringData: + JWT_SECRET: SuperSecretPass! # insert your JWT secret key, 16 characters +--- + +# Data Fetcher +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-data-fetcher-config + namespace: dataphos +data: + MANAGER_URL: http://publisher-manager:8080 +--- + +# Scheduler +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-scheduler-config + namespace: dataphos +data: + WORKER_IMAGE: syntioinc/dataphos-publisher-worker:1.0.0 + FETCHER_URL: http://publisher-data-fetcher:8081 + SCHEMA_GENERATOR_URL: http://publisher-avro-schema-generator:8080 + SCHEMA_VALIDATION_URL: http:// # insert the schema registry public URL or an empty string if schema registry is not deployed + IMAGE_PULL_SECRET: regcred + KUBERNETES_NAMESPACE: dataphos + SECRET_NAME_PUBSUB: pubsub-key + SECRET_NAME_KAFKA: kafka-tls-credentials + SECRET_NAME_NATS: nats-tls-credentials + SECRET_NAME_PULSAR: pulsar-tls-credentials +--- + +# WebUI +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-webui-config + namespace: dataphos +data: + "server.properties": | + window.MANAGER_ENDPOINT = "/backend" +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres + namespace: dataphos +spec: + selector: + app: publisher-postgres-db + ports: + - port: 5432 +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-db + namespace: dataphos +spec: + serviceName: publisher-postgres + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-db + template: + metadata: + labels: + app: publisher-postgres-db + spec: + containers: + - name: publisher-postgres + image: postgres:latest + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-config + - secretRef: + name: publisher-postgres-secret + volumeMounts: + - name: publisher-postgres-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-volume + namespace: dataphos + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi +--- + +# Initialize metadata database +apiVersion: batch/v1 +kind: Job +metadata: + name: publisher-initdb + namespace: dataphos +spec: + template: + spec: + containers: + - name: initdb + image: syntioinc/dataphos-publisher-initdb:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: 
publisher-metadata-config + - secretRef: + name: publisher-metadata-secret + restartPolicy: OnFailure + backoffLimit: 15 +--- + + +# Avro Schema Generator +apiVersion: v1 +kind: Service +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + selector: + app: server + component: avro-schema-generator + ports: + - protocol: TCP + port: 8080 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: avro-schema-generator + template: + metadata: + labels: + app: server + component: avro-schema-generator + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: avro-schema-generator + image: syntioinc/dataphos-publisher-avro-schema-generator:1.0.0 + resources: + limits: + cpu: 500m + requests: + cpu: 50m + memory: 250Mi +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-manager + namespace: dataphos +spec: + selector: + app: server + component: manager + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-manager + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: manager + template: + metadata: + labels: + app: server + component: manager + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-manager:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 45Mi + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: publisher-manager-config + - secretRef: + name: publisher-manager-secret + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + selector: + app: server + component: data-fetcher + ports: + - protocol: TCP + port: 8081 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: data-fetcher + template: + metadata: + labels: + app: server + component: data-fetcher + annotations: + syntio.net/logme: "true" + spec: + initContainers: + - name: check-manager-health + image: curlimages/curl:7.85.0 + command: ['sh', '-c', 'while [ `curl -s -o /dev/null -w "%{http_code}" http://publisher-manager:8080` -ne 200 ]; do echo waiting for manager to be ready...; sleep 10; done;'] + containers: + - name: data-fetcher + image: syntioinc/dataphos-publisher-data-fetcher:1.0.0 + resources: + limits: + cpu: 600m + requests: + cpu: 200m + memory: 160Mi + ports: + - containerPort: 8081 + envFrom: + - configMapRef: + name: publisher-data-fetcher-config +--- + + +# Kubernetes Service Account +apiVersion: v1 +kind: ServiceAccount +metadata: + name: publisher-sa + namespace: dataphos +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: publisher-sa-role + namespace: dataphos +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: publisher-sa-rb + namespace: dataphos +subjects: + - kind: ServiceAccount + name: publisher-sa +roleRef: + kind: Role + name: publisher-sa-role + apiGroup: rbac.authorization.k8s.io +--- + 
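# Note: the ServiceAccount, Role and RoleBinding above give the Scheduler below (which runs
# under serviceAccountName: publisher-sa) the pod permissions it needs to create and clean up
# the worker pods it launches (see WORKER_IMAGE in the Scheduler ConfigMap).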
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-scheduler + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: scheduler + template: + metadata: + labels: + app: server + component: scheduler + annotations: + syntio.net/logme: "true" + spec: + serviceAccountName: publisher-sa + containers: + - name: scheduler + image: syntioinc/dataphos-publisher-scheduler:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + envFrom: + - configMapRef: + name: publisher-scheduler-config + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-webui + namespace: dataphos +spec: + selector: + app: webui + component: webui + ports: + - port: 8080 + type: NodePort +--- + +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: publisher-webui-ingress + namespace: dataphos + annotations: + kubernetes.io/ingress.global-static-ip-name: # insert the name of your static IP address for Web UI ingress + ingress.gcp.kubernetes.io/pre-shared-cert: # insert the name of your Google managed certificate +spec: + rules: + - host: # insert your webui domain name, same as in the Manager config map + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: publisher-webui + port: + number: 8080 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-webui + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: webui + component: webui + template: + metadata: + labels: + app: webui + component: webui + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-webui:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + ports: + - containerPort: 8080 + volumeMounts: + - name: publisher-webui-config-volume + mountPath: /usr/share/nginx/html/config.js + subPath: config.js + volumes: + - name: publisher-webui-config-volume + configMap: + name: publisher-webui-config + items: + - key: server.properties + path: config.js +``` +{{< /details >}} + +## Publisher secrets +{{< details "YAML example" >}} + +``` +# Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: dataphos +--- + +apiVersion: v1 +kind: Secret +metadata: + name: webui-tls-secret + namespace: dataphos +type: kubernetes.io/tls +stringData: + tls.crt: + tls.key: +``` +{{< /details >}} + +## v3 config +{{< details "YAML example" >}} + +``` +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_req +prompt = no +[req_distinguished_name] +C = +ST = +L = +O = +OU = +CN = +[v3_req] +keyUsage = nonRepudiation, digitalSignature, keyEncipherment +extendedKeyUsage = serverAuth +subjectAltName = @alt_names +[alt_names] +DNS.1 = +DNS.2 = +``` +{{< /details >}} diff --git a/dataphos-docs/content/referenced scripts/YAML examples/schemaregistry.md b/dataphos-docs/content/referenced scripts/YAML examples/schemaregistry.md new file mode 100644 index 0000000..7e913fb --- /dev/null +++ b/dataphos-docs/content/referenced scripts/YAML examples/schemaregistry.md @@ -0,0 +1,703 @@ +--- +title: "Schema Registry Examples" +draft: false +weight: 3 +--- + +# Dataphos Schema Registry + +## Schema Registry API +{{< details "YAML example" >}} + +``` +apiVersion: v1 +kind: Secret +metadata: + name: schema-registry-secret + namespace: dataphos +type: Opaque +stringData: + POSTGRES_PASSWORD: $postgres_password # insert password here + PGDATA: /data/pgdata + SR_HOST: 
schema-history-svc + SR_TABLE_PREFIX: syntio_schema. + SR_DBNAME: postgres + SR_USER: postgres + SERVER_PORT: "8080" + +--- +# Schema history service +apiVersion: "v1" +kind: "Service" +metadata: + name: "schema-history-svc" + namespace: dataphos +spec: + ports: + - protocol: "TCP" + port: 5432 + targetPort: 5432 + selector: + app: "schema-history" + type: "ClusterIP" +--- +# Schema history (PostgreSQL database that stores the schemas) +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: "schema-history" + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + serviceName: "schema-history-svc" + selector: + matchLabels: + app: "schema-history" + replicas: 1 + template: + metadata: + labels: + app: "schema-history" + spec: + containers: + - name: "schema-history" + image: postgres:latest + ports: + - containerPort: 5432 + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: POSTGRES_PASSWORD + - name: PGDATA + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: PGDATA + volumeMounts: + - mountPath: /data + name: "schema-history-disk" + # Volume Claim + volumeClaimTemplates: + - metadata: + name: "schema-history-disk" + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 25Gi +--- +# Registry service +apiVersion: "v1" +kind: "Service" +metadata: + name: "schema-registry-svc" + namespace: dataphos +spec: + ports: + - name: http + port: 8080 + targetPort: http + - name: compatiblity + port: 8088 + targetPort: compatiblity + - name: validity + port: 8089 + targetPort: validity + selector: + app: "schema-registry" + type: "LoadBalancer" + loadBalancerIP: "" +--- +# Registry deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: schema-registry + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: schema-registry + template: + metadata: + labels: + app: schema-registry + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + initContainers: + - name: check-schema-history-health + image: busybox + command: [ + "/bin/sh", + "-c", + "until nc -zv schema-history-svc 5432 -w1; do echo 'waiting for db'; sleep 1; done" + ] + - name: initdb + image: syntioinc/dataphos-schema-registry-initdb:1.0.0 + env: + - name: SR_PASSWORD + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: POSTGRES_PASSWORD + - name: SR_HOST + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_HOST + - name: SR_TABLE_PREFIX + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_TABLE_PREFIX + - name: SR_DBNAME + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_DBNAME + - name: SR_USER + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_USER + securityContext: + privileged: false + containers: + - name: gke-sr + image: syntioinc/dataphos-schema-registry-api:1.0.0 + env: + - name: SR_PASSWORD + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: POSTGRES_PASSWORD + - name: SR_HOST + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_HOST + - name: SR_TABLE_PREFIX + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_TABLE_PREFIX + - name: SR_DBNAME + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_DBNAME + - name: SR_USER + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_USER + - name: SERVER_PORT + valueFrom: + secretKeyRef: + name: 
schema-registry-secret + key: SERVER_PORT + - name: COMPATIBILITY_CHECKER_URL + value: "http://localhost:8088" + - name: VALIDITY_CHECKER_URL + value: "http://localhost:8089" + resources: + limits: + cpu: "400m" + memory: "500Mi" + requests: + cpu: "400m" + memory: "500Mi" + ports: + - name: http + containerPort: 8080 + - name: compatibility-checker + image: syntioinc/dataphos-schema-registry-compatibility:1.0.0 + ports: + - name: compatibility + containerPort: 8088 + - name: validity-checker + image: syntioinc/dataphos-schema-registry-validity:1.0.0 + ports: + - name: validity + containerPort: 8089 +--- +``` +{{< /details >}} + +## Schema Registry Validator General +{{< details "YAML example" >}} + +``` +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: +# Uncomment the type you want to use and fill the values for it + +# CONSUMER_TYPE: "kafka" +# CONSUMER_KAFKA_ADDRESS: +# CONSUMER_KAFKA_TOPIC: +# CONSUMER_KAFKA_GROUP_ID: + +# CONSUMER_TYPE: "pubsub" +# CONSUMER_PUBSUB_PROJECT_ID: +# CONSUMER_PUBSUB_SUBSCRIPTION_ID: + +# CONSUMER_TYPE: "servicebus" +# CONSUMER_SERVICEBUS_CONNECTION_STRING: +# CONSUMER_SERVICEBUS_TOPIC: +# CONSUMER_SERVICEBUS_SUBSCRIPTION: + + +# PRODUCER_TYPE: "kafka" +# PRODUCER_KAFKA_ADDRESS: + +# PRODUCER_TYPE: "pubsub" +# PRODUCER_PUBSUB_PROJECT_ID: + +# PRODUCER_TYPE: "servicebus" +# PRODUCER_SERVICEBUS_CONNECTION_STRING: + + TOPICS_VALID: + TOPICS_DEAD_LETTER: + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: + VALIDATORS_ENABLE_AVRO: + VALIDATORS_ENABLE_PROTOBUF: + VALIDATORS_ENABLE_CSV: + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: centralconsumer-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- +``` +{{< /details >}} + +## Schema Registry Validator Kafka +{{< details "YAML example" >}} + +``` +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: + CONSUMER_TYPE: "kafka" + CONSUMER_KAFKA_ADDRESS: $consumer_address # insert consumer bootstrap server here + CONSUMER_KAFKA_TOPIC: $consumer_topic # insert consumer topic + CONSUMER_KAFKA_GROUP_ID: $consumer_group_id # insert consumer group ID + + PRODUCER_TYPE: "kafka" + PRODUCER_KAFKA_ADDRESS: $producer_address # insert producer bootstrap server here + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert 
producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + envFrom: + - configMapRef: + name: centralconsumer-config + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred +--- +``` +{{< /details >}} + +## Schema Registry Validator Kafka To Pubsub +{{< details "YAML example" >}} +``` +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: + CONSUMER_TYPE: "kafka" + CONSUMER_KAFKA_ADDRESS: $consumer_address # insert consumer bootstrap server here + CONSUMER_KAFKA_TOPIC: $consumer_topic # insert consumer topic + CONSUMER_KAFKA_GROUP_ID: $consumer_group_id # insert consumer group ID + + PRODUCER_TYPE: "pubsub" + PRODUCER_PUBSUB_PROJECT_ID: $producer_project_ID # insert GCP project ID + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: centralconsumer-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- +``` +{{< /details >}} + +## Schema Registry Validator Pubsub +{{< details "YAML example" >}} + +``` +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: 
dataphos +data: + CONSUMER_TYPE: "pubsub" + CONSUMER_PUBSUB_PROJECT_ID: $consumer_project_ID # insert consumer GCP project ID + CONSUMER_PUBSUB_SUBSCRIPTION_ID: $consumer_subscription_ID # insert producer pubsub subscription ID + PRODUCER_TYPE: "pubsub" + PRODUCER_PUBSUB_PROJECT_ID: $producer_project_ID # insert producer GCP project ID + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- + +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: centralconsumer-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- +``` +{{< /details >}} + +## Schema Registry Validator Service Bus +{{< details "YAML example" >}} + +``` +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: + CONSUMER_TYPE: "servicebus" + CONSUMER_SERVICEBUS_CONNECTION_STRING: $consumer_servicebus_connection_string # insert the consumer service bus connection string + CONSUMER_SERVICEBUS_TOPIC: consumer_servicebus_topic # insert te consumer service bus topic + CONSUMER_SERVICEBUS_SUBSCRIPTION: $consumer_servicebus_subscription # insert te consumer service bus subsription + + PRODUCER_TYPE: "servicebus" + PRODUCER_SERVICEBUS_CONNECTION_STRING: $producer_servicebus_connection_string # insert the producer service bus connection string + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + 
replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + envFrom: + - configMapRef: + name: centralconsumer-config + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- +``` +{{< /details >}} diff --git a/dataphos-docs/content/referenced scripts/_index.md b/dataphos-docs/content/referenced scripts/_index.md new file mode 100644 index 0000000..c684ffe --- /dev/null +++ b/dataphos-docs/content/referenced scripts/_index.md @@ -0,0 +1,9 @@ +--- +title: "Dataphos Scripts" +draft: false +weight: 5 +--- + +Scripts for deploying Dataphos. + +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/referenced scripts/deployment scripts/_index.md b/dataphos-docs/content/referenced scripts/deployment scripts/_index.md new file mode 100644 index 0000000..3370f02 --- /dev/null +++ b/dataphos-docs/content/referenced scripts/deployment scripts/_index.md @@ -0,0 +1,7 @@ +--- +title: "Deployment Scripts" +draft: false +weight: 3 +geekdocCollapseSection: true +--- +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/referenced scripts/deployment scripts/persistor.md b/dataphos-docs/content/referenced scripts/deployment scripts/persistor.md new file mode 100644 index 0000000..d30f84a --- /dev/null +++ b/dataphos-docs/content/referenced scripts/deployment scripts/persistor.md @@ -0,0 +1,1598 @@ +--- +title: "Persistor Scripts" +draft: false +--- + +# Dataphos Persistor + +## Persistor Azure +{{< details "Deployment Script" >}} + +``` +#!/bin/bash +if [ $# -ne 12 ]; then + echo "please specify all required variables" + exit 1 +fi +client_id=$1 +client_secret=$2 +tenant_id=$3 +persistor_sb_conn_string=$4 +persistor_topic=$5 +persistor_sub=$6 +deadletter_topic=$7 +storage_account=$8 +container=$9 +indexer_sb_conn_string=${10} +indexer_topic=${11} +indexer_sub=${12} + +kubectl apply -f - <}} + + +## Persistor GCP +{{< details "Deployment Script" >}} + +``` +#!/bin/bash +if [ $# -ne 8 ]; then + echo "please specify all required variables" + exit 1 +fi +project_id=$1 +persistor_topic=$2 +persistor_sub=$3 +bucket=$4 +deadletter_topic=$5 +indexer_topic=$6 +indexer_sub=$7 +path_to_key_file=$8 + +kubectl create secret generic per-gcp-access -n dataphos --from-file=key.json=$path_to_key_file +kubectl apply -f - <}} + +## Persistor Kafka to Azure Blog Storage +{{< details "Deployment Script" >}} +``` +#!/bin/bash +if [ $# -ne 12 ]; then + echo "please specify all required variables" + exit 1 +fi +client_id=$1 +client_secret=$2 +tenant_id=$3 +persistor_kafka_address=$4 +persistor_topic=$5 +kafka_consumer_persistor_group_id=$6 +storage_account=$7 +container=$8 +deadletter_topic=$9 +indexer_kafka_address=${10} +indexer_topic=${11} +kafka_consumer_indexer_group_id=${12} + +kubectl apply -f - <}} + +## Persistor Kafka to GCS + +{{< details "Deployment Script" >}} + +``` +#!/bin/bash +if [ $# -ne 10 ]; then + echo "please specify all required variables" + exit 1 +fi +project_id=$1 +persistor_kafka_address=$2 +persistor_topic=$3 +kafka_consumer_persistor_group_id=$4 +bucket=$5 +deadletter_topic=$6 +indexer_kafka_address=$7 +indexer_topic=$8 +kafka_consumer_indexer_group_id=$9 +path_to_key_file=${10} + +kubectl create secret generic per-gcp-access -n dataphos 
--from-file=key.json=$path_to_key_file +kubectl apply -f - <}} +## Delete deployment + +{{< details "Deletion Script" >}} + +``` +#!/bin/bash +kubectl delete secret per-gcp-access -n dataphos +kubectl delete -f - <}} diff --git a/dataphos-docs/content/referenced scripts/deployment scripts/prometheus.md b/dataphos-docs/content/referenced scripts/deployment scripts/prometheus.md new file mode 100644 index 0000000..c268d7a --- /dev/null +++ b/dataphos-docs/content/referenced scripts/deployment scripts/prometheus.md @@ -0,0 +1,249 @@ +--- +title: "Prometheus Scripts" +draft: false +--- + +# Prometheus +Deployment script for the Prometheus server for gathering the metrics and monitoring the logs. + +## Bash +{{< details "Deployment Script" >}} +``` +#!/bin/bash +if [ $# -ne 1 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 + +kubectl apply -f - < 1 + for: 1m + labels: + severity: slack + annotations: + summary: High Memory Usage + prometheus.yml: |- + global: + scrape_interval: 5s + evaluation_interval: 5s + rule_files: + - /etc/prometheus/prometheus.rules + alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "alertmanager.$namespace.svc:9093" + scrape_configs: + - job_name: schema-registry + scrape_interval: 5s + metrics_path: "/metrics" + static_configs: + - targets: ["metrics:2112"] + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: 'node-exporter' + action: keep + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - job_name: 'kubernetes-nodes' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] + - job_name: 'kubernetes-cadvisor' + scheme: https + tls_config: + ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-deployment + namespace: $namespace + labels: + app: prometheus-server +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus-server + template: + metadata: + labels: + app: prometheus-server + spec: + containers: + - name: prometheus + image: prom/prometheus + args: + - "--storage.tsdb.retention.time=12h" + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus/" + ports: + - containerPort: 9090 + resources: + requests: + cpu: 500m + memory: 500M + limits: + cpu: 1 + memory: 1Gi + volumeMounts: + - name: prometheus-config-volume + mountPath: /etc/prometheus/ + - name: prometheus-storage-volume + mountPath: /prometheus/ + volumes: + - name: prometheus-config-volume + configMap: + defaultMode: 420 + name: prometheus-server-conf + + - name: prometheus-storage-volume + emptyDir: {} +--- +EOF +``` +{{< /details >}} diff --git a/dataphos-docs/content/referenced scripts/deployment scripts/publisher.md b/dataphos-docs/content/referenced scripts/deployment scripts/publisher.md new file mode 100644 index 0000000..f71265e --- /dev/null +++ b/dataphos-docs/content/referenced scripts/deployment scripts/publisher.md @@ -0,0 +1,1071 @@ +--- +title: "Publisher Scripts" +draft: true +--- + +# Dataphos Publisher + +{{< details "Deployment Script" >}} +## Bash +``` +#!/bin/bash + +Help() +{ +echo "Flags:" +echo "-n - the namespace where Publisher will be deployed" +echo "-u - username for the Postgres metadata database" +echo "-p - password for the Postgres metadata database" +} + +if [ $# -eq 1 ] & [ $1 == "--help" ]; then + Help + exit +fi + +if [ $# -ne 6 ]; then + echo "Please specify all required variables" + exit 1 +fi + +while getopts n:u:p: flag +do + case "${flag}" in + n) namespace=${OPTARG};; + u) postgres_user=${OPTARG};; + p) postgres_password=${OPTARG};; + esac +done + +kubectl apply -f - <}} + +## PowerShell +{{< details "Deployment Script" >}} +``` +#! 
/usr/bin/pwsh + +if($args.Count -ne 3){ + Write-Host "please specify all required variables" + exit 1 +} + +$namespace=$args[0] +$postgres_user=$args[1] +$postgres_password=$args[2] + +$myYaml = @" +# Manager and Web UI services +apiVersion: v1 +kind: Service +metadata: + name: publisher-manager + namespace: $namespace +spec: + selector: + app: server + component: manager + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + name: publisher-webui + namespace: $namespace +spec: + selector: + app: webui + component: webui + ports: + - port: 8080 + type: LoadBalancer +--- +# Postgres metadata database +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-postgres-config + namespace: $namespace +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-secret + namespace: $namespace +type: Opaque +stringData: + POSTGRES_USER: $postgres_user + POSTGRES_PASSWORD: $postgres_password +--- +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres + namespace: $namespace +spec: + selector: + app: publisher-postgres-db + ports: + - port: 5432 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-db + namespace: $namespace +spec: + serviceName: publisher-postgres + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-db + template: + metadata: + labels: + app: publisher-postgres-db + spec: + containers: + - name: publisher-postgres + image: postgres:latest + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-config + - secretRef: + name: publisher-postgres-secret + volumeMounts: + - name: publisher-postgres-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-volume + namespace: $namespace + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi +--- +# Common configuration +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-metadata-config + namespace: $namespace +data: + METADATA_HOST: publisher-postgres + METADATA_PORT: "5432" + METADATA_DATABASE: publisher_metadata +--- +apiVersion: v1 +kind: Secret +metadata: + name: publisher-metadata-secret + namespace: $namespace +type: Opaque +stringData: + METADATA_USERNAME: $postgres_user # insert your database username + METADATA_PASSWORD: $postgres_password # insert your database user password +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pubsub-key + namespace: $namespace +type: Opaque +data: + "key.json": "" # insert your base64 encoded Pub/Sub service account key, leave empty if publishing to Pub/Sub + # not needed (optional) +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: kafka-tls-credentials + namespace: $namespace +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Kafka cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Kafka user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Kafka user TLS private key, leave empty if not needed (optional) +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: nats-tls-credentials + namespace: $namespace +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user 
TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pulsar-tls-credentials + namespace: $namespace +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- +apiVersion: v1 +kind: Secret +metadata: + name: encryption-keys + namespace: $namespace +type: Opaque +stringData: # insert your encryption keys, one or more + "keys.yaml": | + ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A" +--- +# Initialize metadata database +apiVersion: batch/v1 +kind: Job +metadata: + name: publisher-initdb + namespace: $namespace +spec: + template: + spec: + containers: + - name: initdb + image: syntioinc/dataphos-publisher-initdb:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret + restartPolicy: OnFailure + backoffLimit: 15 +--- +# Avro Schema Generator +apiVersion: v1 +kind: Service +metadata: + name: publisher-avro-schema-generator + namespace: $namespace +spec: + selector: + app: server + component: avro-schema-generator + ports: + - protocol: TCP + port: 8080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-avro-schema-generator + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: avro-schema-generator + template: + metadata: + labels: + app: server + component: avro-schema-generator + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: avro-schema-generator + image: syntioinc/dataphos-publisher-avro-schema-generator:1.0.0 + resources: + limits: + cpu: 500m + requests: + cpu: 50m + memory: 250Mi +--- +# Kubernetes Service Account +apiVersion: v1 +kind: ServiceAccount +metadata: + name: publisher-sa + namespace: $namespace +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: publisher-sa-role + namespace: $namespace +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: publisher-sa-rb + namespace: $namespace +subjects: + - kind: ServiceAccount + name: publisher-sa +roleRef: + kind: Role + name: publisher-sa-role + apiGroup: rbac.authorization.k8s.io +--- +# Scheduler +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-scheduler-config + namespace: $namespace +data: + WORKER_IMAGE: syntioinc/dataphos-publisher-worker:1.0.0 + FETCHER_URL: http://publisher-data-fetcher:8081 + SCHEMA_GENERATOR_URL: http://publisher-avro-schema-generator:8080 + SCHEMA_VALIDATION_URL: "" # insert the schema registry public URL or an empty string if schema registry is not deployed + IMAGE_PULL_SECRET: regcred + KUBERNETES_NAMESPACE: $namespace + SECRET_NAME_PUBSUB: pubsub-key + SECRET_NAME_KAFKA: kafka-tls-credentials + SECRET_NAME_NATS: nats-tls-credentials + SECRET_NAME_PULSAR: pulsar-tls-credentials +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-scheduler + namespace: $namespace +spec: + replicas: 1 
+ selector: + matchLabels: + app: server + component: scheduler + template: + metadata: + labels: + app: server + component: scheduler + annotations: + syntio.net/logme: "true" + spec: + serviceAccountName: publisher-sa + containers: + - name: scheduler + image: syntioinc/dataphos-publisher-scheduler:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + envFrom: + - configMapRef: + name: publisher-scheduler-config + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- +"@ + +$myYaml |kubectl.exe apply -f - +do{ + $webui_ip = $(kubectl get services --namespace $namespace publisher-webui --output jsonpath='{.status.loadBalancer.ingress[0].ip}') + Write-Host "Waiting for Web UI service to be created..." + Start-Sleep -Seconds 10 +}while($null -eq $webui_ip) +Write-Host "$webui_ip" + +$oneYaml = @" +# Manager +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-manager-config + namespace: $namespace +data: + WEB_UI: http://$($webui_ip):8080 + FETCHER_URL: http://publisher-data-fetcher:8081 +--- +apiVersion: v1 +kind: Secret +metadata: + name: publisher-manager-secret + namespace: $namespace +type: Opaque +stringData: + JWT_SECRET: "DuperSecretPass!" # insert your JWT secret key, 16 characters +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-manager + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: manager + template: + metadata: + labels: + app: server + component: manager + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-manager:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 45Mi + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: publisher-manager-config + - secretRef: + name: publisher-manager-secret + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- +# WebUI +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-webui-config + namespace: $namespace +data: # insert your manager domain name + "server.properties": | + window.MANAGER_ENDPOINT = "/backend" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-webui + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: webui + component: webui + template: + metadata: + labels: + app: webui + component: webui + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-webui:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + ports: + - containerPort: 8080 + volumeMounts: + - name: publisher-webui-config-volume + mountPath: /usr/share/nginx/html/config.js + subPath: config.js + volumes: + - name: publisher-webui-config-volume + configMap: + name: publisher-webui-config + items: + - key: server.properties + path: config.js +"@ + +$oneYaml |kubectl.exe apply -f - + +$newYaml = @" +# Data Fetcher +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-data-fetcher-config + namespace: $namespace +data: + MANAGER_URL: http://publisher-manager:8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: publisher-data-fetcher + namespace: $namespace +spec: + selector: + app: server + component: data-fetcher + ports: + - protocol: TCP + port: 8081 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-data-fetcher + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: 
data-fetcher + template: + metadata: + labels: + app: server + component: data-fetcher + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: data-fetcher + image: syntioinc/dataphos-publisher-data-fetcher:1.0.0 + resources: + limits: + cpu: 600m + requests: + cpu: 200m + memory: 160Mi + ports: + - containerPort: 8081 + envFrom: + - configMapRef: + name: publisher-data-fetcher-config +--- +"@ +$newYaml |kubectl.exe apply -f - +``` +{{< /details >}} diff --git a/dataphos-docs/content/referenced scripts/deployment scripts/schemaregistry.md b/dataphos-docs/content/referenced scripts/deployment scripts/schemaregistry.md new file mode 100644 index 0000000..d34a72b --- /dev/null +++ b/dataphos-docs/content/referenced scripts/deployment scripts/schemaregistry.md @@ -0,0 +1,2007 @@ +--- +title: "Schema Registry Scripts" +draft: false +--- + +# Dataphos Schema Registry + +## Schema Registry API +{{< details "Schema Registry API" >}} +``` +#!/bin/bash + +if [ $# -ne 2 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +postgres_password=$2 + +kubectl apply -f - <}} +## Delete Schema Registry API +{{< details "Delete Script" >}} +``` +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 + +kubectl delete -f - <}} +## Schema Registry Validator Kafka +{{< details "Deployment Script" >}} +``` +#!/bin/bash + +if [ $# -ne 8 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +producer_valid_topic_ID=$2 +producer_deadletter_topic_ID=$3 +message_type=$4 +consumer_address=$5 +consumer_topic=$6 +consumer_group_id=$7 +producer_address=$8 + +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +deploy_xml_validator () { + kubectl apply -f - <}} + +## Delete Schema Registry Validator Kafka +{{< details "Deletion Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 2 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +message_type=$2 + +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +delete_xml_validator () { + kubectl delete -f - <}} + +## Schema Registry Validator Kafka to PubSub +{{< details "Deployment Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 8 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +producer_valid_topic_ID=$2 +producer_deadletter_topic_ID=$3 +message_type=$4 +consumer_address=$5 +consumer_topic=$6 +consumer_group_id=$7 +producer_project_ID=$8 + +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +deploy_xml_validator () { + kubectl apply -f - <}} + +## Delete Schema Registry Validator Kafka to PubSub +{{< details "Deletion Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 2 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +message_type=$2 + + +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo 
"unsupported message type" + exit 1 +fi + +delete_xml_validator () { + kubectl delete -f - <}} + +## Schema Registry Validator PubSub +{{< details "Deployment Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 7 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +producer_valid_topic_ID=$2 +producer_deadletter_topic_ID=$3 +message_type=$4 +consumer_project_ID=$5 +consumer_subscription_ID=$6 +producer_project_ID=$7 +path_to_key_file=$8 + +kubectl create secret generic service-account-credentials -n dataphos --from-file=key.json=$path_to_key_file + +# shellcheck disable=SC2054 +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +deploy_xml_validator () { + kubectl apply -f - <}} + +## Delete Schema Registry Validator PubSub +{{< details "Deletion Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 2 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +message_type=$2 + +# shellcheck disable=SC2054 +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +delete_xml_validator () { + kubectl delete -f - <}} + +## Schema Registry Validator ServiceBus +{{< details "Deployment Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 8 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +producer_valid_topic_ID=$2 +producer_deadletter_topic_ID=$3 +message_type=$4 +consumer_servicebus_connection_string=$5 +consumer_servicebus_topic=$6 +consumer_servicebus_subscription=$7 +producer_servicebus_connection_string=$8 + +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +deploy_xml_validator () { + kubectl apply -f - <}} + +## Delete Schema Registry Validator ServiceBus +{{< details "Deletion Script" >}} + +``` +#!/bin/bash + +if [ $# -ne 2 ]; then + echo "please specify all required variables" + exit 1 +fi + +namespace=$1 +message_type=$2 + +supported_message_types=("json", "avro", "protobuf", "xml", "csv") +if echo "${supported_message_types[@]}" | grep -qw "$message_type"; then + echo "supported message type" +else + echo "unsupported message type" + exit 1 +fi + +delete_xml_validator () { + kubectl delete -f - <}} diff --git a/dataphos-docs/content/schema_registry/_index.md b/dataphos-docs/content/schema_registry/_index.md new file mode 100644 index 0000000..2d3547c --- /dev/null +++ b/dataphos-docs/content/schema_registry/_index.md @@ -0,0 +1,9 @@ +--- +title: "Schema Registry" +draft: false +weight: 1 +--- + +The Schema Registry is a one-stop solution for registering schemas for serialized data running through your streaming infrastructure and validating them in-flight without changing a single line of existing code. 
+ +{{< toc-tree >}} \ No newline at end of file diff --git a/dataphos-docs/content/schema_registry/configuration/_index.md b/dataphos-docs/content/schema_registry/configuration/_index.md new file mode 100644 index 0000000..3d048eb --- /dev/null +++ b/dataphos-docs/content/schema_registry/configuration/_index.md @@ -0,0 +1,26 @@ +--- +title: "Deployment Customization" +draft: false +weight: 4 +geekdocCollapseSection: true +--- + + +This page describes the Schema Registry architecture. Whereas [Quickstart](/schema_registry/quickstart) will get you started fairly quickly, this page will explain more precisely the individual components being deployed, how they interact and how to configure them. The following pages go into further detail on how to customize your Kubernetes deployments: +{{< toc-tree >}} + +# Schema Registry Architecture + +The following diagram gives an overview of the individual Schema Registry components and how they interact with your underlying Kubernetes environment: + +![Architecture](/sr_architecture.png) + +When deploying the Schema Registry, you deploy the following components: + +* The **Postgres History Database** that will be used to store the schemas submitted to the Schema Registry. +* The **Schema Registry** REST server is used by users and validators to submit and pull schemas respectively. +* The **Schema Compatibility Checker** is a utility used by the Schema Registry server to ensure new schemas follow the designated compatibility mode. +* The **Schema Validity Checker** is a utility used by the Schema Registry server to ensure new schemas are valid to begin with. +* The **Validator** component is the specific component you attach to a message broker topic for validation purposes. +* **(Optionally used if validating XML data)** The **XML Validator** is a utility, decoupled validator used for validating XML schemas. +* **(Optionally used if validating CSV data)** The **CSV Validator** is a utility, decoupled validator used for validating CSV schemas. diff --git a/dataphos-docs/content/schema_registry/configuration/helm.md b/dataphos-docs/content/schema_registry/configuration/helm.md new file mode 100644 index 0000000..3ce1e4b --- /dev/null +++ b/dataphos-docs/content/schema_registry/configuration/helm.md @@ -0,0 +1,85 @@ +--- +title: "Helm" +draft: false +weight: 2 +--- + +# Dataphos Schema Registry + +The Helm Chart for the Dataphos Schema Registry component. + +## Configuration {#reference_schema_registry} + +Below is the list of configurable options in the `values.yaml` file. + +| Variable | Type | Description | Default | +|-----------------------------|---------|---------------------------------------------------------------------------------|-----------------------------------------------------------| +| namespace | string | The namespace to deploy the Schema Registry into. | `dataphos` | +| images | object | Docker images to use for each of the individual Schema Registry sub-components. | | +| images.initdb | string | Initdb Docker image. | `syntioinc/dataphos-schema-registry-initdb:1.0.0` | +| images.registry | string | The Schema Registry image. | `syntioinc/dataphos-schema-registry-api:1.0.0` | +| images.compatibilityChecker | string | The compatibility checker image. | `syntioinc/dataphos-schema-registry-compatibility:1.0.0` | +| images.validityChecker | string | Validity Checker image. | `syntioinc/dataphos-schema-registry-validity:1.0.0` | +| registryReplicas | integer | The number of replicas of the Schema Registry service. 
| `1` | +| registrySvcName | string | The name of the Schema Registry service. | `schema-registry-svc` | +| database | object | The Schema History database configuration object. | | +| database.name | string | History database name. | `postgres` | +| database.username | string | History database username. | `postgres` | +| database.password | string | History database password. | `POSTGRES_PASSWORD` | + + +# Dataphos Schema Validator {#reference_schema_validator} + +The Helm Chart for the Dataphos Validator component. + +## Configuration + +Below is the list of configurable options in the `values.yaml` file. + +| Variable | Type | Description | Default | +|-----------------------|---------|---------------------------------------------------------------------------------|--------------------------------------------------------| +| namespace | string | The namespace to deploy the Schema Registry into. | `dataphos` | +| images | object | Docker images to use for each of the individual Schema Registry sub-components. | | +| images.validator | string | The Validator image. | `syntioinc/dataphos-schema-registry-validator:1.0.0` | +| images.xmlValidator | string | The XML Validator image. | `syntioinc/dataphos-schema-registry-xml-val:1.0.0` | +| images.csvValidator | string | The CSV validator image. | `syntioinc/dataphos-schema-registry-csv-val:1.0.0` | +| xmlValidator | object | The XML Validator configuration. | | +| xmlValidator.enable | boolean | Determines whether the XML validator should be enabled. | `true` | +| xmlValidator.replicas | integer | The number of XML Validator replicas. | `1` | +| csvValidator | object | The CSV Validator configuration. | | +| csvValidator.enable | boolean | Determines whether the CSV validator should be enabled. | `true` | +| csvValidator.replicas | integer | The number of CSV Validator replicas. | `1` | +| schemaRegistryURL | string | The link to the Schema Registry component. | `http://schema-registry-svc:8080` | + +### Broker Configuration + +The `values.yaml` file contains a `brokers` object used to set up the key references to be used by the validators to +connect to one or more brokers deemed as part of the overall platform infrastructure. + +| Variable | Type | Description | Applicable If | +|------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------|------------------------| +| brokers | object | The object containing the general information on the brokers the validator service will want to associate with. | | +| brokers.BROKER_ID | object | The object representing an individual broker's configuration. | | +| brokers.BROKER_ID.type | string | Denotes the broker's type. | | +| brokers.BROKER_ID.connectionString | string | The Azure Service Bus Namespace connection string. | `type` == `servicebus` | +| brokers.BROKER_ID.projectID | string | The GCP project ID. | `type` == `pubsub` | +| brokers.BROKER_ID.brokerAddr | string | The Kafka bootstrap server address. | `type` == `kafka` | + +### Validator Configuration {#reference_validator} + +The `values.yaml` file contains a `validator` object used to configure one or more validators to be deployed as part of +the release, explicitly referencing brokers defined in the previous section. 
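+
+As an illustration only — the broker IDs, topics and project below are placeholders, and the table below documents each field — a matching `brokers`/`validator` pair in `values.yaml` might look like this:
+
+```yaml
+brokers:
+  srcKafka:                       # placeholder broker ID
+    type: kafka
+    brokerAddr: my-kafka-bootstrap:9092   # placeholder bootstrap address
+  dstPubsub:                      # placeholder broker ID
+    type: pubsub
+    projectID: my-gcp-project     # placeholder GCP project
+
+validator:
+  json-validator:                 # placeholder validator ID
+    broker: srcKafka              # messages are pulled FROM this broker
+    destinationBroker: dstPubsub  # results are sent TO this broker
+    topic: input-topic
+    consumerID: input-consumer-group
+    validTopic: valid-topic
+    deadletterTopic: deadletter-topic
+    replicas: 1
+```
+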
+ +| Variable | Type | Description | Applicable If | +|---------------------------------------|--------|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------| +| validator | object | The object containing the information on all of the validators to be deployed as part of the Helm installation. | | +| validator.VAL_ID | object | The object representing the individual validator's configuration. | | +| validator.VAL_ID.broker | string | Reference to the broker messages are pulled FROM. | | +| validator.VAL_ID.destinationBroker | string | Reference to the broker messages are sent TO. | | +| validator.VAL_ID.topic | string | The topic the messages are pulled FROM. | | +| validator.VAL_ID.consumerID | string | The consumer identifier (subscription, consumer group, etc). | | +| validator.VAL_ID.validTopic | string | The topic VALID messages are sent TO. | | +| validator.VAL_ID.deadletterTopic | string | The topic INVALID messages are sent TO. | | +| validator.VAL_ID.replicas | string | The number of replicas of a given validator instance to pull/process messages simultaneously. | | +| validator.VAL_ID.serviceAccountSecret | string | The reference to a secret that contains a `key.json` key and the contents of a Google Service Account JSON file as its contents. | `brokers.BROKER_ID.type` == `pubsub` | +| validator.VAL_ID.serviceAccountKey | string | A Google Service Account private key in JSON format, base64 encoded. Used to create a new `serviceAccountSecret` secret, if provided. | `brokers.BROKER_ID.type` == `pubsub` | \ No newline at end of file diff --git a/dataphos-docs/content/schema_registry/configuration/pulumi.md b/dataphos-docs/content/schema_registry/configuration/pulumi.md new file mode 100644 index 0000000..4975776 --- /dev/null +++ b/dataphos-docs/content/schema_registry/configuration/pulumi.md @@ -0,0 +1,194 @@ +--- +title: "Pulumi" +draft: false +weight: 3 +--- + +## ⚙️ Configuration + +There are three possible sources of resource configuration values: user configuration in the active stack configuration file, retrieved data from existing resources, and default system-level configuration from the application code. + +User configuration will always take precedence over other configuration sources. If there is no special user configuration for a parameter, the retrieved value from the resource’s previous configuration will be used. If there wasn’t any data retrieved for the resource (as it is being created for the first time), the default system-level configuration value will be used instead. The default values for parameters are listed in the appropriate section of the configuration options. + +If the configuration references an existing cloud resource, the program will retrieve its data from the cloud provider and import the resource into the active stack instead of creating a new one. If the user configuration values specify any additional parameters that differ from the resource configuration while it has not yet been imported into the stack, the deployment will fail. To modify an existing resource’s configuration, import it into the stack first and then redeploy the infrastructure with the desired changes. + +**Note:** Implicit import of an AKS cluster is currently not supported. To use an existing AKS cluster in your infrastructure, set the AKS cluster's `import` configuration option to `true`. 
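+
+For example, reusing an existing AKS cluster and keeping it when the stack is destroyed might be expressed in the stack configuration roughly as follows (the cluster ID and names are placeholders, the exact nesting follows your stack template, and each option is described in the cluster configuration section below):
+
+```yaml
+cluster:
+  sr-cluster:                    # placeholder cluster ID
+    type: aks
+    name: existing-aks-cluster   # placeholder name of the existing cluster
+    resourceGroup: existing-rg   # placeholder resource group
+    import: true                 # use the existing AKS cluster instead of creating one
+    retain: true                 # keep the cluster when the infrastructure is destroyed
+```
+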
+ +⚠️ **WARNING** ⚠️ + +Imported resources will **NOT** be retained by default when the infrastructure is destroyed. If you want to retain a resource when the infrastructure is destroyed, you need to explicitly set its `retain` flag to `true` in the active stack's configuration file. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state on a `pulumi destroy`. +Azure resource groups and GCP projects are set to be retained by default and can be deleted manually. Be careful if you choose not to retain them, as destroying them will remove **ALL** children resources, even the ones created externally. It is recommended to modify these options only if you are using a dedicated empty project/resource group. + +### Global configuration options + +| Variable | Type | Description | Default value | +|--------------------------|---------|----------------------------------------------------------------------------------------------------------------|---------------| +| `namespace` | string | The name of the Kubernetes namespace where Dataphos Helm charts will be deployed to. | `dataphos` | +| `deploySchemaRegistry` | boolean | Whether the Schema Registry Helm chart should be deployed. | `false` | +| `deploySchemaValidators` | boolean | Whether the Schema Validator Helm chart should be deployed. | `false` | +| `retainResourceGroups` | boolean | Whether Azure resource groups should be retained when the infrastructure is destroyed. | `true` | +| `retainProjects` | boolean | Whether GCP projects should be retained when the infrastructure is destroyed. | `true` | +| `resourceTags` | object | Set of `key:value` tags attached to all Azure resource groups; or set of labels attached to all GCP resources. | | + +### Product configuration options + +The `namespace` and `images` options at the top-level of the Helm chart configurations are set by default and do not need to be manually configured. + +Cloud-specific variables should not be manually configured. Depending on the configured cloud provider, service accounts with appropriate roles are automatically created and their credentials are used to populate these variables. + +| Variable | Type | Description | +|---------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `dataphos-schema-registry` | object | Dataphos Schema Registry Helm chart configuration. Configuration options are listed in the [schema registry configuration]({{< ref "helm#reference_schema_registry">}}). | +| `dataphos-schema-validator` | object | Dataphos Schema Validator Helm chart configuration. Configuration options are listed in the [schema validator configuration]({{< ref "helm#reference_schema_validator">}}). | +| `dataphos-schema-validator.validator` | object | The object containing the information on all of the validators to be deployed. Configuration options are listed in the [validator configuration]({{< ref "helm#reference_validator">}}). | + + + +## Provider configuration options +The variables listed here are required configuration options by their respective Pulumi providers. Your entire infrastructure should reside on a single cloud platform. Deployment across multiple cloud platforms is currently not fully supported. 
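+
+For instance, the provider settings from the tabs below would typically appear in the active stack's `Pulumi.<stack-name>.yaml` under its `config` block — a minimal sketch, with placeholder values and only one provider configured at a time:
+
+```yaml
+config:
+  # Azure deployment
+  azure-native:location: westeurope
+  # GCP deployment (use instead of the Azure setting)
+  # gcp:project: my-gcp-project
+  # gcp:region: europe-west2
+  # gcp:zone: europe-west2-a
+```
+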
+ +{{< tabs "Provider configuration options" >}} + +{{< tab "Azure" >}} +| Variable | Type | Description | Example value | +|-------------------------|--------|------------------------------------|---------------| +| `azure-native:location` | string | The default resource geo-location. | `westeurope` | + +A list of all configuration options for this provider can be found here: +[Azure Native configuration options](https://www.pulumi.com/registry/packages/azure-native/installation-configuration/#configuration-options). + +{{}} + + +{{< tab "GCP" >}} +To successfully deploy resources in a GCP project, the appropriate APIs need to be enabled for that project in the API Console. See: [Enable and disable APIs](https://support.google.com/googleapi/answer/6158841). + +| Variable | Type | Description | Example value | +|---------------|--------|--------------------------|-------------------| +| `gcp:project` | string | The default GCP project. | `syntio-dataphos` | +| `gcp:region` | string | The default region.. | `europe-west2` | +| `gcp:zone` | string | The default zone. | `europe-west2-a` | + +A list of all configuration options for this provider can be found here: +[GCP configuration options](https://www.pulumi.com/registry/packages/gcp/installation-configuration/#configuration-reference). + +{{}} +{{}} + +## Cluster configuration options + +The stack configuration `cluster` object is utilized to configure the Kubernetes cluster necessary to deploy the Helm charts that comprise Dataphos products. + +### Common cluster configuration + +| Variable | Type | Description | +|-----------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cluster` | object | The object containing the general information on the cluster. | +| `cluster.CLUSTER_ID` | object | The object representing an individual cluster's configuration. | +| `cluster.CLUSTER_ID.type` | string | The type of the managed cluster. Valid values: [`gke`, `aks`]. | +| `cluster.CLUSTER_ID.name` | string | The name of the managed cluster. | +| `cluster.CLUSTER_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +### Specific cluster configuration + +{{< tabs "Cluster configuration options" >}} + +{{< tab "AKS" >}} +| Variable | Type | Description | Default value | +|----------------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------| +| `cluster.CLUSTER_ID.import` | boolean | Whether to use an existing AKS cluster instead of creating a new one.
**Note:** AKS clusters imported in this way will be retained on destroy, unless its resource group is not retained. | `false` | +| `cluster.CLUSTER_ID.resourceGroup` | string | The name of the resource group. The name is case insensitive. | | +| `cluster.CLUSTER_ID.sku` | object | The managed cluster SKU. | | +| `cluster.CLUSTER_ID.sku.name` | string | The managed cluster SKU name. | `Basic` | +| `cluster.CLUSTER_ID.sku.tier` | string | The managed cluster SKU tier. | `Free` | +| `cluster.CLUSTER_ID.dnsPrefix` | string | The cluster DNS prefix. This cannot be updated once the Managed Cluster has been created. | | +| `cluster.CLUSTER_ID.agentPoolProfiles` | object | The agent pool properties. | | +| `cluster.CLUSTER_ID.agentPoolProfiles.name` | string | Windows agent pool names must be 6 characters or less. | | +| `cluster.CLUSTER_ID.agentPoolProfiles.count` | integer | Number of agents (VMs) to host docker containers. | `3` | +| `cluster.CLUSTER_ID.agentPoolProfiles.enableAutoScaling` | boolean | Whether to enable auto-scaler. | `false` | +| `cluster.CLUSTER_ID.agentPoolProfiles.minCount` | integer | The minimum number of nodes for auto-scaling. | `1` | +| `cluster.CLUSTER_ID.agentPoolProfiles.maxCount` | integer | The maximum number of nodes for auto-scaling. | `5` | +| `cluster.CLUSTER_ID.agentPoolProfiles.vmSize` | string | VM size availability varies by region. See: [Supported VM sizes](https://docs.microsoft.com/azure/aks/quotas-skus-regions#supported-vm-sizes) | `Standard_DS2_v2` | +| `cluster.CLUSTER_ID.tags` | object | Set of `key:value` tags attached to the AKS Cluster. This will override the global `resourceTags` configuration option for this resource. | | + + +{{}} + +{{< tab "GKE" >}} + +| Variable | Type | Description | Default value | +|----------------------------------------------------------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cluster.CLUSTER_ID.projectID` | string | The project ID is a unique identifier for a GCP project. | | +| `cluster.CLUSTER_ID.location` | string | The geo-location where the resource lives. | | +| `cluster.CLUSTER_ID.initialNodeCount` | integer | The number of nodes to create in this cluster's default node pool. | `3` | +| `cluster.CLUSTER_ID.nodeConfigs` | object | Parameters used in creating the default node pool. | | +| `cluster.CLUSTER_ID.nodeConfig.machineType` | string | The name of a Google Compute Engine machine type. | `e2-medium` | +| `cluster.CLUSTER_ID.clusterAutoscalings` | object list | Per-cluster configuration of Node Auto-Provisioning with Cluster Autoscaler to automatically adjust the size of the cluster and create/delete node pools based on the current needs of the cluster's workload. | | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].autoscalingProfile` | string | Lets you choose whether the cluster autoscaler should optimize for resource utilization or resource availability when deciding to remove nodes from a cluster. Valid values: [`BALANCED`, `OPTIMIZE_UTILIZATION`]. | `BALANCED` | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].enabled` | boolean | Whether node auto-provisioning is enabled. 
| `false` | +| `cluster.CLUSTER_ID.clusterAutoscalings[0].resourceLimits` | object list | Global constraints for machine resources in the cluster. Configuring the cpu and memory types is required if node auto-provisioning is enabled. | resourceLimits:
- resource_type: cpu
  minimum: 1
  maximum: 1
- resource_type: memory
  minimum: 1
  maximum: 1 | +| `cluster.CLUSTER_ID.resourceLabels` | object | Set of `key:value` labels attached to the GKE Cluster. This will override the global `resourceTags` configuration option for this resource. | | + +{{}} +{{}} + +## Broker configuration options +The stack configuration `brokers` object is used to set up the key references to be used by the dataphos components to connect to one or more brokers deemed to be part of the overall platform infrastructure. + +Product configs directly reference brokers by their `BROKER_ID` listed in the broker config. The same applies to `TOPIC_ID` and `SUB_ID` – the keys of those objects are the actual names of the topics and subscriptions used. + +### Common broker configuration + +| Variable | Type | Description | +|--------------------------------------------------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers` | object | The object containing the general information on the brokers. | +| `brokers.BROKER_ID` | object | The object representing an individual broker's configuration. | +| `brokers.BROKER_ID.type` | string | Denotes the broker's type. Valid values: [`kafka`, `pubsub`, `servicebus`]. | +| `brokers.BROKER_ID.topics` | object | The object containing the general information on the topics. | +| `brokers.BROKER_ID.topics.TOPIC_ID` | object | The object representing an individual topic's configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions` | object | The object containing topic subscription (consumer group) configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID` | object | The object representing an individual topic subscription's configuration. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID.retain` | boolean | If set to true, resource will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +The Azure storage account type. Valid values: [`Storage`, `StorageV2`, `BlobStorage`, `BlockBlobStorage`, `FileStorage`]. The default and recommended value is `BlockBlobStorage`. + +### Specific broker configuration + +{{< tabs Broker configuration options >}} +{{< tab "Azure Service Bus" >}} +| Variable | Type | Description | +|-----------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers.BROKER_ID.azsbNamespace` | string | The Azure Service Bus namespace name. | +| `brokers.BROKER_ID.resourceGroup` | string | The Azure Service Bus resource group name. | +| `brokers.BROKER_ID.sku` | object | The Azure Service Bus namespace SKU properties. | +| `brokers.BROKER_ID.sku.name` | string | Name of this SKU. Valid values: [`BASIC`, `STANDARD`, `PREMIUM`]. Default value is `STANDARD`. | +| `brokers.BROKER_ID.sku.tier` | string | The billing tier of this SKU. [`BASIC`, `STANDARD`, `PREMIUM`]. Default value is `STANDARD`. 
| +| `brokers.BROKER_ID.sku.capacity` | integer | The specified messaging units for the tier. For Premium tier, valid capacities are 1, 2 and 4. | +| `brokers.BROKER_ID.tags` | object | Set of `key:value` tags attached to the Azure Service Bus namespace. This will override the global `resourceTags` configuration option for this resource. | +| `brokers.BROKER_ID.retain` | boolean | If set to true, the Azure Service Bus namespace will be retained when infrastructure is destroyed. Retained resources will not be deleted from the backing cloud provider, but will be removed from the Pulumi state. | + +{{}} + + +{{< tab "Google Cloud Pub/Sub" >}} +| Variable | Type | Description | +|--------------------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| `brokers.BROKER_ID.projectID` | string | The GCP project ID. | +| `brokers.BROKER_ID.topics.TOPIC_ID.labels` | object | Set of `key:value` labels attached to the Pub/Sub topic. This will override the global `resourceTags` configuration option for this resource. | +| `brokers.BROKER_ID.topics.TOPIC_ID.subscriptions.SUBSCRIPTION_ID.labels` | object | Set of `key:value` labels attached to the Pub/Sub subscription. This will override the global `resourceTags` configuration option for this resource. | + +{{}} + +{{< tab "Kafka" >}} +| Variable | Type | Description | Default value | +|------------------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| `brokers.BROKER_ID.brokerAddr` | string | The Kafka bootstrap server address. Optional. If omitted or empty, a new Strimzi Kafka cluster operator and cluster will be deployed with default settings. | | +| `brokers.BROKER_ID.clusterName` | string | The name of the Strimzi Kafka cluster custom Kubernetes resource. | `kafka-cluster` | +| `brokers.BROKER_ID.clusterNamespace` | string | The Kubernetes namespace where the cluster will be deployed. | `kafka-cluster` | +| `brokers.BROKER_ID.topics.TOPIC_ID.partitions` | integer | Number of partitions for a specific topic. | `3` | +| `brokers.BROKER_ID.topics.TOPIC_ID.replicas` | integer | Number of replicas for a specific topic. | `1` | + +{{}} +{{}} + diff --git a/dataphos-docs/content/schema_registry/configuration/shell.md b/dataphos-docs/content/schema_registry/configuration/shell.md new file mode 100644 index 0000000..372decd --- /dev/null +++ b/dataphos-docs/content/schema_registry/configuration/shell.md @@ -0,0 +1,203 @@ +--- +title: "Shell" +draft: false +weight: 1 +--- + +# Schema Registry Configuration + +The **Schema Registry server** may be deployed following the same steps as outlined on the [quickstart](/schema_registry/quickstart) page. There are no additional configuration options required beyond deploying the microservices and configuring the credentials of the schema history database. + +## Validator Configuration + +The **Validator** component of the Schema Registry is a single Docker image configured through a set of environment variables. The tables below cover the core configuration variables, outlining how the different combinations of producers and consumers can be configured. 
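+
+As a rough, non-authoritative sketch — every value below is a placeholder, the `kafka`/`pubsub` type identifiers are assumptions, and the tables that follow remain the reference for each variable — a validator pulling from Kafka and publishing to GCP Pub/Sub could be configured with a container environment block such as:
+
+```yaml
+env:
+  - name: CONSUMER_TYPE
+    value: "kafka"                    # assumed identifier for a Kafka consumer
+  - name: CONSUMER_KAFKA_ADDRESS
+    value: "my-kafka-bootstrap:9092"  # placeholder bootstrap address
+  - name: CONSUMER_KAFKA_TOPIC
+    value: "input-topic"
+  - name: CONSUMER_KAFKA_GROUP_ID
+    value: "validator-group"
+  - name: PRODUCER_TYPE
+    value: "pubsub"                   # assumed identifier for a Pub/Sub producer
+  - name: PRODUCER_PUBSUB_PROJECT_ID
+    value: "my-gcp-project"           # placeholder GCP project
+  - name: TOPICS_VALID
+    value: "valid-topic"
+  - name: TOPICS_DEAD_LETTER
+    value: "deadletter-topic"
+  - name: REGISTRY_URL
+    value: "http://schema-registry-svc:8080"
+  - name: VALIDATORS_ENABLE_JSON
+    value: "true"
+```
+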
+ +The tables are organized by the message broker technology, listing which variables need to be set when pulling data from the given broker type and when publishing data to the given broker type. If, for instance, you wish to pull and validate data from **Kafka** and publish the results to **GCP Pub/Sub**, you would read the **Consumer** section from the **Kafka** tab, and then the **Producer** section of the **GCP Pub/Sub** tab. + +> **NOTE**: Values with the "*" sign in the following tables are required and need to be set! + +{{< tabs "Configuration" >}} +{{< tab "Common Configuration" >}} + +## Common configuration + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| CONSUMER_TYPE* |Type of broker to consume messages from.| string | | +| PRODUCER_TYPE* |Type of broker to deliver valid and invalid messages to. | string | | +| TOPICS_VALID* |Topic or ID where valid messages are published to.| string | | +| TOPICS_DEAD_LETTER* |Topic or ID where invalid messages are published to.| string | | +| REGISTRY_URL* |Address of the Schema Registry (If deployed in the same namespace as the **Registry** component it can stay http://schema-registry-svc.com as the local DNS name of the service)| string | http://schema-registry-svc.com | +| REGISTRY_GET_TIMEOUT |Interval to wait for the fetch request response.| time.Duration | 4s | +| REGISTRY_REGISTER_TIMEOUT |Interval to wait for the register request response.| time.Duration | 10s | +| REGISTRY_UPDATE_TIMEOUT |Interval to wait for the update request response.| time.Duration | 10s | +| REGISTRY_INMEM_CACHE_SIZE |Cache size for the fetched schemas.| int | 100 | +| VALIDATORS_ENABLE_CSV |Enable CSV validation.| bool | false | +| VALIDATORS_ENABLE_JSON |Enable JSON validation.| bool | false | +| VALIDATORS_ENABLE_AVRO |Enable avro validation.| bool | false | +| VALIDATORS_ENABLE_PROTOBUF |Enable protobuf validation.| bool | false | +| VALIDATORS_ENABLE_XML |Enable XML validation.| bool | false | +| VALIDATORS_CSV_URL |Address of the CSV validator.| string | http://csv-validator-svc.com | +| VALIDATORS_CSV_TIMEOUT_BASE |Interval to wait for connecting to the CSV validator.| time.Duration | 2s | +| VALIDATORS_JSON_USE_ALT_BACKEND |Use another library for validation (gojsonschema instead of jsonschema).| bool | false | +| VALIDATORS_JSON_CACHE_SIZE |Size of the JSON validator cache.| int | 100 | +| VALIDATORS_PROTOBUF_FILE_PATH |File path to the .proto file.| string | “./.schemas” | +| VALIDATORS_PROTOBUF_CACHE_SIZE |Protobuf validator cache size.| int | 100 | +| VALIDATORS_XML_URL |Address of the XML validator.| string | http://csv-validator-svc.com | +| VALIDATORS_XML_TIMEOUT_BASE |Interval to wait for connecting to the XML validator.| time.Duration | 3s | + +{{< /tab >}} +{{< tab "Additional Configuration" >}} + +## Additional Configuration + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| SHOULD_LOG_MISSING_SCHEMA |Log a warning if a message is missing a schema.| bool | false | +| SHOULD_LOG_VALID |Log an information if a message is classified as valid.| bool | false | +| SHOULD_LOG_DEAD_LETTER |Log an error if a message is classified as deadletter.| bool | false | +| RUN_OPTIONS_ERR_THRESHOLD |The acceptable amount of unrecoverable message processing errors per RUN_OPTIONS_ERR_INTERVAL. 
If the threshold is reached, a run is preemptively canceled. A non-positive value is ignored.| int64 | 50 | +| RUN_OPTIONS_ERR_INTERVAL |The time interval used to reset the RUN_OPTIONS_ERR_THRESHOLD counter. If no change to the counter is observed in this interval, the counter is reset, as it's safe to assume the system has managed to recover from the erroneous behavior. Only used if RUN_OPTIONS_ERR_THRESHOLD is a positive integer.| time.Duration | 1m | +| RUN_OPTIONS_NUM_RETRIES |Determines the number of times the executor will repeat erroneous calls to the handler. Keep in mind this may result in duplicates if the messaging system re-sends messages on acknowledgment timeout. Setting this option will lead record-based executors to stop polling for new messages until the ones which are currently being retry-ed are either successful or the number of retries exceeds NumRetries.| int | 0 | +| NUM_SCHEMA_COLLECTORS |Defines the maximum amount of inflight requests to the schema registry.| int | -1 | +| NUM_INFERRERS |Defines the maximum amount of inflight destination topic inference jobs (validation and routing).| int | -1 | +| METRICS_LOGGING_INTERVAL |Defines how often the metrics are going to be logged.| time.Duration | 5s | + +{{< /tab >}} + +{{< tab "Kafka" >}} + +## Kafka Consumer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| CONSUMER_KAFKA_ADDRESS* |Comma-separated list of at least one broker which is a member of the target cluster.| string | | +| CONSUMER_KAFKA_TOPIC* |Name of the topic from which the Validator component will consume the messages.| string | | +| CONSUMER_KAFKA_GROUP_ID* |Determines which consumer group the consumer belongs to.| string | | +| CONSUMER_KAFKA_
TLS_CONFIG_CLIENT_KEY_FILE |Path to the client TLS key file.| string | | +| CONSUMER_KAFKA_
TLS_CONFIG_CA_CERT_FILE |Path to the CA TLS certificate file.| string | | +| CONSUMER_KAFKA_SETTINGS_
MAX_BYTES |The maximum amount of bytes Kafka will return whenever the consumer polls a broker. It is used to limit the size of memory that the consumer will use to store data that was returned from the server, irrespective of how many partitions or messages were returned. | int | 10485760 | +| CONSUMER_KAFKA_SETTINGS_
MAX_CONCURRENT_FETCHES |The maximum number of fetch requests to allow in flight or buffered at once. This setting, paired with CONSUMER_KAFKA_SETTINGS_MAX_BYTES, can upper bound the maximum amount of memory that the client can use for consuming. Requests are issued to brokers in a FIFO order: once the client is ready to issue a request to a broker, it registers that request and issues it in order with other registrations.
A value of 0 implies the allowed concurrency is unbounded and will be limited only by the number of brokers in the cluster.| int | 3 | +| CONSUMER_KAFKA_SETTINGS_
MAX_POLL_RECORDS |The maximum number of records that a single call to poll() will return. Use this to control the amount of data (but not the size of data) your application will need to process in one iteration. Keep in mind that this is only the maximum number of records; there's no guarantee the BatchIterator will return CONSUMER_KAFKA_SETTINGS_MAX_POLL_RECORDS even if the state of the topic the iterator consumes from allows it.| int | 100 | + +## Kafka Producer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| PRODUCER_KAFKA_ADDRESS* |Comma-separated list of at least one broker which is a member of the target cluster.| string | | +| PRODUCER_KAFKA_
TLS_CONFIG_ENABLED |Whether TLS should be enabled for the producer's connection to the Kafka cluster.| bool | false | +| PRODUCER_KAFKA_
TLS_CONFIG_CLIENT_CERT_FILE** |Path to the client TLS certificate file.| string | | +| PRODUCER_KAFKA_
TLS_CONFIG_CLIENT_KEY_FILE** |Path to the client TLS key file.| string | | +| PRODUCER_KAFKA_
TLS_CONFIG_CA_CERT_FILE** |Path to the CA TLS certificate file.| string | | +| PRODUCER_KAFKA_SETTINGS_
BATCH_SIZE |The max amount of records the client will buffer, blocking new produces until records are finished if this limit is reached.| int | 40 | +| PRODUCER_KAFKA_SETTINGS_
BATCH_BYTES |When multiple records are sent to the same partition, the producer will batch them together. This parameter controls the amount of memory in bytes that will be used for each batch.

This does not mean that the producer will wait for the batch to become full. The producer will send half-full batches and even batches with just a single message in them. Therefore, setting the batch size too large will not cause delays in sending messages; it will just use more memory for the batches.| int64 | 5242880 | +| PRODUCER_KAFKA_SETTINGS_
LINGER |The amount of time to wait for additional messages before sending the current batch. The producer sends a batch of messages either when the current batch is full or when the Linger limit is reached, whatever comes first. This variable is specific to a topic partition. A high volume producer will likely be producing to many partitions; it is both unnecessary to linger in this case and inefficient because the client will have many timers running (and stopping and restarting) unnecessarily.| time.Duration | 10ms | + +{{< /tab >}} + +{{< tab "Event Hubs" >}} + +## EventHubs Consumer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| CONSUMER_EVENTHUBS_ADDRESS* |Address of the Event Hubs server.| string | | +| CONSUMER_EVENTHUBS_TOPIC* |Name of the topic from which the Validator component will consume the messages.| string | | +| CONSUMER_EVENTHUBS_GROUP_ID* |Determines which group the consumer belongs to.| string | | +| CONSUMER_EVENTHUBS_
TLS_CONFIG_CLIENT_KEY_FILE |Path to the client TLS key file.| string | | +| CONSUMER_EVENTHUBS_
TLS_CONFIG_CA_CERT_FILE |Path to the CA TLS certificate file.| string | | +| CONSUMER_EVENTHUBS_
SASL_CONFIG_USER* |SASL username.| string | | +| CONSUMER_EVENTHUBS_
SASL_CONFIG_PASSWORD* |SASL password.| string | | +| CONSUMER_EVENTHUBS_SETTINGS_
MAX_BYTES |The maximum amount of bytes Kafka will return whenever the consumer polls a broker. It is used to limit the size of memory that the consumer will use to store data that was returned from the server, irrespective of how many partitions or messages were returned.| int | 10485760 | +| CONSUMER_EVENTHUBS_SETTINGS_
MAX_CONCURRENT_FETCHES |The maximum number of fetch requests to allow in flight or buffered at once. This setting, paired with CONSUMER_EVENTHUBS_SETTINGS_MAX_BYTES, can upper bound the maximum amount of memory that the client can use for consuming. Requests are issued to brokers in a FIFO order: once the client is ready to issue a request to a broker, it registers that request and issues it in order with other registrations.
A value of 0 implies the allowed concurrency is unbounded and will be limited only by the number of brokers in the cluster.| int | 3 | +| CONSUMER_EVENTHUBS_SETTINGS_
MAX_POLL_RECORDS |The maximum number of records that a single call to poll() will return. Use this to control the amount of data (but not the size of data) your application will need to process in one iteration. Keep in mind that this is only the maximum number of records; there's no guarantee the BatchIterator will return CONSUMER_EVENTHUBS_SETTINGS_MAX_POLL_RECORDS even if the state of the topic the iterator consumes from allows it.| int | 100 | + +## EventHubs Producer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| PRODUCER_EVENTHUBS_ADDRESS* |Address of the Event Hubs producer server.| string | | +| PRODUCER_EVENTHUBS_
TLS_CONFIG_CLIENT_KEY_FILE |Path to the client TLS key file.| string | | +| PRODUCER_EVENTHUBS_
TLS_CONFIG_CA_CERT_FILE |Path to the CA TLS certificate file.| string | | +| PRODUCER_EVENTHUBS_
SASL_CONFIG_USER* |SASL username.| string | | +| PRODUCER_EVENTHUBS_
SASL_CONFIG_PASSWORD* |SASL password.| string | | +| PRODUCER_EVENTHUBS_SETTINGS_
BATCH_SIZE |The max amount of records the client will buffer, blocking new produces until records are finished if this limit is reached.| int | 40 | +| PRODUCER_EVENTHUBS_SETTINGS_
BATCH_BYTES |When multiple records are sent to the same partition, the producer will batch them together. This parameter controls the amount of memory in bytes that will be used for each batch. This does not mean that the producer will wait for the batch to become full. The producer will send half-full batches and even batches with just a single message in them. Therefore, setting the batch size too large will not cause delays in sending messages; it will just use more memory for the batches.| int64 | 5242880 | +| PRODUCER_EVENTHUBS_SETTINGS_
LINGER |The amount of time to wait for additional messages before sending the current batch. The producer sends a batch of messages either when the current batch is full or when the Linger limit is reached, whatever comes first. This variable is specific to a topic partition. A high volume producer will likely be producing to many partitions; it is both unnecessary to linger in this case and inefficient because the client will have many timers running (and stopping and restarting) unnecessarily.| time.Duration | 10ms | + +{{< /tab >}} + +{{< tab "GCP Pub/Sub" >}} + +## Pub/Sub Consumer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| CONSUMER_PUBSUB_PROJECT_ID* |ID of the GCP project where Pub/Sub consumer is running.| string | | +| CONSUMER_PUBSUB_SUBSCRIPTION_ID* |Subscription ID of the topic from which the Validator component will consume the messages. | string | | +| CONSUMER_PUBSUB_SETTINGS_
MAX_EXTENSION |The maximum period for which the Subscription should automatically extend the ack deadline for each message. The Subscription will automatically extend the ack deadline of all fetched Messages up to the duration specified. Automatic deadline extension beyond the initial receipt may be disabled by specifying a duration less than 0.| time.Duration | 30m | +| CONSUMER_PUBSUB_SETTINGS_
MAX_EXTENSION_PERIOD |The maximum duration by which to extend the ack deadline at a time. The ack deadline will continue to be extended by up to this duration until CONSUMER_PUBSUB_SETTINGS_MAX_EXTENSION is reached. Setting this variable bounds the maximum amount of time before a message redelivery in the event the subscriber fails to extend the deadline. CONSUMER_PUBSUB_SETTINGS_MAX_EXTENSION_PERIOD must be between 10s and 600s (inclusive). This configuration can be disabled by specifying a duration less than (or equal to) 0.| time.Duration | 3m | +| CONSUMER_PUBSUB_SETTINGS_
MAX_OUTSTANDING_MESSAGES |The maximum number of unprocessed messages (unacknowledged but not yet expired). If this variable is 0, default value will be taken. If the value is negative, then there will be no limit on the number of unprocessed messages.| int | 1000 | +| CONSUMER_PUBSUB_SETTINGS_
MAX_OUTSTANDING_BYTES |The maximum size of unprocessed messages (unacknowledged but not yet expired). If MaxOutstandingBytes is 0, it will be treated as if it were DefaultReceiveSettings.MaxOutstandingBytes. If the value is negative, then there will be no limit on the number of bytes for unprocessed messages.| int | 419430400 | +| CONSUMER_PUBSUB_SETTINGS_
NUM_GOROUTINES |The number of goroutines that each data structure along the Receive path will spawn.| int | 10 | + +## Pub/Sub Producer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| PRODUCER_PUBSUB_PROJECT_ID* |ID of the GCP project where Pub/Sub producer is running.| string | | +| PRODUCER_PUBSUB_SETTINGS_
DELAY_THRESHOLD |Publish a non-empty batch after this delay has passed.| time.Duration | 50ms | +| PRODUCER_PUBSUB_SETTINGS_
COUNT_THRESHOLD |Publish a batch when it has this many messages.| int | 50 | +| PRODUCER_PUBSUB_SETTINGS_
BYTE_THRESHOLD |Publish a batch when its size in bytes reaches this value.| int | 52428800 | +| PRODUCER_PUBSUB_SETTINGS_
NUM_GOROUTINES |The number of goroutines used in each of the data structures that are involved along the Publish path. Adjusting this value adjusts concurrency along the publish path.| int | 5 | +| PRODUCER_PUBSUB_SETTINGS_
MAX_OUTSTANDING_MESSAGES |The maximum number of buffered messages to be published. If less than or equal to zero, this is disabled.| int | 800 | +| PRODUCER_PUBSUB_SETTINGS_
MAX_OUTSTANDING_BYTES |The maximum size of buffered messages to be published. If less than or equal to zero, this is disabled.| int | 1048576000 | +| PRODUCER_PUBSUB_SETTINGS_
ENABLE_MESSAGE_ORDERING |Enables delivery of ordered keys.| bool | false | + +{{< /tab >}} + +{{< tab "Service Bus" >}} + +## Service Bus Consumer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| CONSUMER_SERVICEBUS_CONNECTION_STRING* |Service Bus consumer connection string.| string | | +| CONSUMER_SERVICEBUS_TOPIC* |Name of the topic from which the Validator component will consume the messages.| string | | +| CONSUMER_SERVICEBUS_SUBSCRIPTION* |Service Bus subscription.| string | | +| CONSUMER_SERVICEBUS_SETTINGS_
BATCH_SIZE |Size of the consumer Service Bus batches.| int | 100 | +| CONSUMER_SERVICEBUS_SETTINGS_
BATCH_TIMEOUT |Timeout for assembling a consumer batch. (Possibly deprecated.)| time.Duration | 500ms | + +## Service Bus Producer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| PRODUCER_SERVICEBUS_
CONNECTION_STRING* |Service Bus producer connection string.| string | | + +{{< /tab >}} + +{{< tab "NATS JetStream" >}} + +## NATS JetStream Consumer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| CONSUMER_JETSTREAM_URL* |JetStream consumer URL.| string | | +| CONSUMER_JETSTREAM_SUBJECT* |Subject from which the Validator component will consume the messages.| string | | +| CONSUMER_JETSTREAM_CONSUMER_NAME* |JetStream consumer name.| string | | +| CONSUMER_JETSTREAM_SETTINGS_
BATCH_SIZE |Size of the consumer JetStream batches.| int | 100 | +| CONSUMER_JETSTREAM_SETTINGS_
BATCH_TIMEOUT |Timeout for assembling a consumer batch. (Possibly deprecated.)| time.Duration | 500ms | + +## NATS JetStream Producer + +| Environment variable name | Description | Type | Default | +|:--------------------------------------------------:|-------------|:-------------:|:------------:| +| PRODUCER_JETSTREAM_URL* |JetStream producer URL.| string | | +| PRODUCER_JETSTREAM_SETTINGS_
MAX_INFLIGHT_PENDING |Specifies the maximum outstanding async publishes that can be inflight at one time.| int | 512 | + +{{< /tab >}} +{{< /tabs >}} + + diff --git a/dataphos-docs/content/schema_registry/quickstart/_index.md b/dataphos-docs/content/schema_registry/quickstart/_index.md new file mode 100644 index 0000000..bd81370 --- /dev/null +++ b/dataphos-docs/content/schema_registry/quickstart/_index.md @@ -0,0 +1,13 @@ +--- +title: "Quickstart" +draft: false +weight: 3 +geekdocCollapseSection: true +--- + +There are 3 options for deploying dataphos components, including the Schema Registry: +{{< toc-tree >}} + +The quickstart guides will get you a working Schema Registry deployment as quickly as possible. Use any of the three deployment options and follow the guide. +The [Deployment Customization](/schema_registry/configuration) contains a detailed overview of configuration parameters if you wish to customize the configuration according to your requirements. + diff --git a/dataphos-docs/content/schema_registry/quickstart/helm.md b/dataphos-docs/content/schema_registry/quickstart/helm.md new file mode 100644 index 0000000..e38f591 --- /dev/null +++ b/dataphos-docs/content/schema_registry/quickstart/helm.md @@ -0,0 +1,181 @@ +--- +title: "Helm" +draft: false +weight: 2 +--- + +## Setting up your environment + +### Prerequisites + +This quickstart guide will assume that you have [Helm](https://helm.sh/) installed and a running Kubernetes cluster on one of the major cloud providers (GCP, Azure). If you happen to be using VS Code make sure to have the Kubernetes and Helm extensions installed to make life a little easier for you. Helm repository can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm). + +Resources that are used must be running before the deployment. +Schema Registry has multiple message broker options. 
This quickstart guide will assume that the publishing message +broker and the consuming message broker will be either GCP Pub/Sub, Azure ServiceBus or Kafka, and that you have +created: + +{{< tabs "platformconfig" >}} +{{< tab "GCP" >}} +- Service account JSON key with the appropriate roles (Pub/Sub Publisher, Pub/Sub Subscriber) ([Service Account Creation](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console), [JSON Key Retrieval](https://cloud.google.com/iam/docs/keys-create-delete)) +- An input topic and subscription (The input topic refers to the topic that contains the data in its original + format) +- Valid topic and subscription (The valid topic refers to the topic where the data is stored after being validated + and serialized using a specific schema) +- Dead-letter topic and subscription (The valid topic refers to the topic where messages that could not be processed + by a consumer are stored for troubleshooting and analysis purposes) +- (optional) Prometheus server for gathering the metrics and monitoring the logs + - Can be deployed quickly using [this deployment script](https://github.com/dataphos/dataphos-docs/blob/main/scripts/prometheus.sh) + +{{< /tab >}} +{{< tab "Azure ServiceBus" >}} +- ServiceBus connection string +- An input topic and subscription (The input topic refers to the topic that contains the data in its original + format) +- Valid topic and subscription (The valid topic refers to the topic where the data is stored after being validated + and serialized using a specific schema) +- Dead-letter topic and subscription (The valid topic refers to the topic where messages that could not be processed + by a consumer are stored for troubleshooting and analysis purposes) +- (optional) Prometheus server for gathering the metrics and monitoring the logs + - Can be deployed quickly using [this deployment script](https://github.com/dataphos/dataphos-docs/blob/main/scripts/prometheus.sh) + +{{< /tab >}} +{{< tab "Kafka" >}} +- Kafka broker. You may deploy one onto your Kubernetes environment via [Strimzi](https://strimzi.io/docs/operators/0.30.0/quickstart.html) +- An input topic (The input topic refers to the topic that contains the data in its original + format) +- Valid topic (The valid topic refers to the topic where the data is stored after being validated + and serialized using a specific schema) +- Dead-letter topic (The valid topic refers to the topic where messages that could not be processed + by a consumer are stored for troubleshooting and analysis purposes) +- (optional) Prometheus server for gathering the metrics and monitoring the logs + - Can be deployed quickly using [this deployment script](https://github.com/dataphos/dataphos-docs/blob/main/scripts/prometheus.sh) + +{{< /tab >}} +{{< /tabs >}} + +### Create the Schema Registry namespace + +Before deploying the Schema Registry, the namespace where the components will be deployed should be created if it +doesn't exist. + +Open a command line tool of your choice and connect to your cluster. Create the namespace where Schema Registry will be +deployed. We will use namespace `dataphos` in this quickstart guide. + +```bash +kubectl create namespace dataphos +``` + +## Deployment +Schema registry is separated into two components: the registry component and the validators component. + +The registry component is used as a central schema management system that provides options of schema registration and versioning as well as schema validity and compatibility checking. 
Therefore, it is usually deployed only once. + +The validator component acts as a message validation system, meaning that it consists of validators that validate the message for the given message schema. The validator supports JSON, AVRO, ProtoBuf, XML and CSV message formats. The idea is to have multiple validator components for every topic you wish to validate the schemas for and therefore the validator component might be deployed multiple times. +## Deploy the Schema Registry - Registry Component + +### Arguments + +The required arguments are: + +- The Kubernetes namespace you will be deploying the registry to +- Schema History Postgres database password + +### Chart Usage + +Each chart has its own configuration settings outlined in its respective subfolder. A `values.yaml` file should be prepared and pass to Helm while performing the installation. Chart can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm/tree/main/dataphos-schema-registry). + +To deploy the `dataphos-schema-registry` chart, run: + +``` +helm install schema-registry ./dataphos-schema-registry +``` + +This would cause the `values.yaml` file to be read from the root directory of the `dataphos-schema-registry` folder. The `--values` flag may be passed in the call to override this behavior. + +You can also add a `--dry-run` flag that will simply generate the Kubernetes manifests and check if they are valid (note that this requires `kubectl` to be configured against an actual cluster). For general linting of the Helm templates, run `helm lint`. + +## Deploy the Schema Registry - Validator Component + +You can deploy the **Validator** component of the Schema Registry using the provided deployment script. + +{{< tabs "Schema Registry - validator component deployment" >}} {{< tab "GCP Pub/Sub" >}} + +### Arguments + +The required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer Pub/Sub valid topic ID +- Producer Pub/Sub dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer GCP Project ID +- Consumer Pub/Sub Subscription ID (created beforehand) +- Producer GCP Project ID + +{{< /tab >}} {{< tab "Azure (Service Bus)" >}} + +### Arguments + +Required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer ServiceBus valid topic ID +- Producer ServiceBus dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer ServiceBus Connection String +- Consumer ServiceBus Topic +- Consumer ServiceBus Subscription +- Producer ServiceBus Connection String + +{{< /tab >}} + +{{< tab "Kafka" >}} + +### Arguments + +Required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer Kafka valid topic ID +- Producer Kafka dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer Kafka broker address +- Consumer Kafka Topic +- Consumer Kafka Group ID +- Producer Kafka broker address + +{{< /tab >}} + +{{< tab "Kafka to Pub/Sub (Consumer Kafka, producer GCP Pub/Sub)" >}} + +### Arguments + +Required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer Kafka valid topic ID +- Producer Kafka dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer Kafka Connection String +- Consumer Kafka Topic +- Consumer 
Kafka Subscription +- Producer GCP Project ID + +{{< /tab >}} + +{{< /tabs >}} + +### Deployment + +Each chart has its own configuration settings outlined in its respective subfolder. A `values.yaml` file should be prepared and pass to Helm while performing the installation. Chart can be accessed on the [Helm repository](https://github.com/dataphos/dataphos-helm/tree/main/dataphos-schema-registry-validator). + +To deploy the `dataphos-schema-validator` chart, run: + +``` +helm install schema-validator ./dataphos-schema-validator +``` + +This would cause the `values.yaml` file to be read from the root directory of the `dataphos-schema-validator` folder. The `--values` flag may be passed in the call to override this behavior. + +You can also add a `--dry-run` flag that will simply generate the Kubernetes manifests and check if they are valid (note that this requires `kubectl` to be configured against an actual cluster). For general linting of the Helm templates, run `helm lint`. \ No newline at end of file diff --git a/dataphos-docs/content/schema_registry/quickstart/pulumi.md b/dataphos-docs/content/schema_registry/quickstart/pulumi.md new file mode 100644 index 0000000..481cbdd --- /dev/null +++ b/dataphos-docs/content/schema_registry/quickstart/pulumi.md @@ -0,0 +1,161 @@ +--- +title: "Pulumi" +draft: false +weight: 3 +--- + +## Setting up your environment + +### Prerequisites + +Schema Registry components run in a Kubernetes environment. This quickstart guide will assume that you have [Python 3](https://www.python.org/downloads/) and [Pulumi](https://www.pulumi.com/docs/install/) tools installed. Pulumi repository can be accessed on the [Pulumi repository](https://github.com/dataphos/dataphos-infra). +Schema Registry has multiple message broker options. This quickstart guide will assume creating new resources instead of importing existing ones into the active stack. If you wish to import your own resources check [Deployment Customization](/schema_registry/configuration/pulumi). + +### Schema Registry namespace + +The namespace where the components will be deployed is defined in the config file, you don't have to create it by yourself. We will use the namespace `dataphos` in this guide. + +```bash + namespace: dataphos +``` + +### Download the Schema Registry Helm charts + +The Dataphos Helm charts are located in the [Dataphos Helm Repository](https://github.com/dataphos/dataphos-helm). + +To properly reference the Schema Registry charts, clone the Helm repository and copy the entire `dataphos-schema-registry` and `dataphos-schema-validator` directories into the `helm_charts` directory of this repository. + +### Install Dependencies + +Create a virtual environment from the project root directory and activate it: + +```bash +py -m venv venv +./venv/Scripts/activate +``` + +Install package dependencies: +```bash +py -m pip install -r ./requirements.txt +``` + +Note: This usually doesn't take long, but can take up to 45 minutes, depending on your setup. + +## Deployment + +Schema registry is separated into two components: the **registry** component and the **validator** component. + +The registry component is used as a central schema management system that provides options of schema registration and +versioning as well as schema validity and compatibility checking. Therefore, it is usually deployed only once. + +The validator component acts as a message validation system, meaning that it consists of validators that validate the +message for the given message schema. 
The validator supports JSON, AVRO, ProtoBuf, XML and CSV message formats. The idea is +to have multiple validator components for every topic you wish to validate the schemas for and therefore the validator +component might be deployed multiple times. + +### Cloud provider and stack configuration + + +{{< tabs "Schema Registry - validator component deployment" >}} +{{< tab "GCP Pub/Sub" >}} + +Deploy all of the required Schema Registry components for publishing messages to the PubSub topic + +Install the Google Cloud SDK and then authorize access with a user account. Next, Pulumi requires default application credentials to interact with your Google Cloud resources, so run auth application-default login command to obtain those credentials: + +```bash +$ gcloud auth application-default login +``` + +### Configure your stack + +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers. Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. + +```bash +$ pulumi stack init schemaregistry-gcp-pubsub-dev +``` +This will create a new stack named `schemaregistry-gcp-pubsub-dev` in your project and set it as the active stack. + + + +{{< /tab >}} +{{< tab "Azure (Service Bus)" >}} + +Deploy all of the required Schema Registry components for consuming messages from a Service Bus topic. + +Log in to the Azure CLI and Pulumi will automatically use your credentials: +```bash +$ az login +``` + +### Configure your stack +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. + +```bash +$ pulumi stack init schemaregistry-azure-sb-dev +``` +This will create a new stack named `schemaregistry-azure-sb-dev` in your project and set it as the active stack. + + +{{< /tab >}} + +{{< tab "Kafka on Azure" >}} +Deploy all of the required Schema Registry components for consuming messages from a Kafka topic. + +Log in to the Azure CLI and Pulumi will automatically use your credentials: +```bash +$ az login +``` + +### Configure your stack +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. 
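+As a sketch of this step (the file name below is an assumption -- check the `config_templates` directory of the repository for the template that matches your stack; stack configuration files follow Pulumi's `Pulumi.<stack-name>.yaml` convention):
+
+```bash
+# Hypothetical template file name; copy the template whose name matches the stack you are about to create.
+cp ./config_templates/Pulumi.schemaregistry-azure-kafka-dev.yaml .
+```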
+ +```bash +$ pulumi stack init schemaregistry-azure-kafka-dev +``` +This will create a new stack named `schemaregistry-azure-kafka-dev` in your project and set it as the active stack. +{{< /tab >}} + +{{< tab "Kafka on GCP" >}} + +Deploy all of the required Schema Registry components for consuming messages from a Kafka topic. + +Install the Google Cloud SDK and then authorize access with a user account. Next, Pulumi requires default application credentials to interact with your Google Cloud resources, so run auth application-default login command to obtain those credentials: + +```bash +$ gcloud auth application-default login +``` + +### Configure your stack + +You can use a stack configuration template file to quickly deploy and modify common architectures. This repository includes a set of pre-configured templates for different combinations of Dataphos components and cloud providers.Configuration specifics can be found in the Configuration section of this manual. + +To start using a stack template, copy the desired file from the config_templates directory into the project root directory. Next, create a new stack to contain your infrastructure configuration. Make sure to use the name of a a pre-configured stack template for your stack. + +```bash +$ pulumi stack init schemaregistry-gcp-kafka-dev +``` +This will create a new stack named `schemaregistry-gcp-kafka-dev` in your project and set it as the active stack. + +{{< /tab >}} + +{{< /tabs >}} + +### Deployment + +Preview and deploy infrastructure changes: +```bash +$ pulumi up +``` +Destroy your infrastructure changes: +```bash +$ pulumi destroy +``` + +Following the deployment, the Schema Registry components will begin automatically pulling data from the configured topic and delivering it to the target storage destination. \ No newline at end of file diff --git a/dataphos-docs/content/schema_registry/quickstart/shell.md b/dataphos-docs/content/schema_registry/quickstart/shell.md new file mode 100644 index 0000000..48a7f16 --- /dev/null +++ b/dataphos-docs/content/schema_registry/quickstart/shell.md @@ -0,0 +1,221 @@ +--- +title: "Shell" +draft: false +weight: 1 +--- + +## Setting up your environment + +### Prerequisites + +Schema Registry components run in a Kubernetes environment. This quickstart guide will assume that you have +the ```kubectl``` tool installed and a running Kubernetes cluster on one of the major cloud providers (GCP, Azure) and a +connection with the cluster. The Kubernetes cluster node/nodes should have at least 8 GB of available RAM. + +Schema Registry has multiple message broker options. This quickstart guide will assume that the publishing message +broker and the consuming message broker will be either GCP Pub/Sub, Azure ServiceBus or Kafka, and that you have +created: + +- (in case of GCP Pub/Sub) service account JSON key with the appropriate roles (Pub/Sub Publisher, Pub/Sub Subscriber) ([Service Account Creation](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console), [JSON Key Retrieval](https://cloud.google.com/iam/docs/keys-create-delete)) +- (in case of Azure ServiceBus) ServiceBus connection string +- (in case of Kafka) Kafka broker. You may deploy one onto your Kubernetes environment via [Strimzi](https://strimzi.io/docs/operators/0.30.0/quickstart.html). 
+- An input topic and subscription (The input topic refers to the topic that contains the data in its original + format) +- Valid topic and subscription (The valid topic refers to the topic where the data is stored after being validated + and serialized using a specific schema) +- Dead-letter topic and subscription (The valid topic refers to the topic where messages that could not be processed + by a consumer are stored for troubleshooting and analysis purposes) +- (optional) Prometheus server for gathering the metrics and monitoring the logs + - Can be deployed quickly using [this deployment script](/referenced-scripts/deployment-scripts/prometheus/#bash) + +Note that in case of Kafka, no subscription resource is required. + +> **NOTE:** All the deployment scripts can be found [here](/referenced-scripts/). + +### Create the Schema Registry namespace + +Before deploying the Schema Registry, the namespace where the components will be deployed should be created if it +doesn't exist. + +Open a command line tool of your choice and connect to your cluster. Create the namespace where Schema Registry will be +deployed. We will use namespace `dataphos` in this quickstart guide. + +```bash +kubectl create namespace dataphos +``` + +## Deployment + +Schema registry is separated into two components: the **registry** component and the **validator** component. + +The registry component is used as a central schema management system that provides options for schema registration and +versioning as well as schema validity and compatibility checking. Therefore, it is usually deployed only once. + +The validator component acts as a message validation system, meaning that it consists of validators that validate the +message for the given message schema. The validator supports JSON, AVRO, ProtoBuf, XML and CSV message formats. The idea is +to have multiple validator components for every topic you wish to validate the schemas for and therefore the validator +component might be deployed multiple times. + +## Deploy the Schema Registry - Registry Component + +You can deploy the **Registry** server component using the provided deployment script. + +### Arguments + +The required arguments are: + +- The Kubernetes namespace you will be deploying the registry to +- Schema History Postgres database password + +### Deployment + +The script can be found [here](/referenced-scripts/deployment-scripts/schemaregistry/#schema-registry-api). To run the script, run the +following command: + +```bash +# "dataphos" is an example of the namespace name +# "p4sSw0rD" is example of the Schema History Postgres password +./sr_registry.sh dataphos p4sSw0rD +``` + +## Deploy the Schema Registry - Validator Component + +You can deploy the **Validator** component of the Schema Registry using the provided deployment script. + +{{< tabs "Schema Registry - validator component deployment" >}} {{< tab "GCP Pub/Sub" >}} + +### Arguments + +The required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer Pub/Sub valid topic ID +- Producer Pub/Sub dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer GCP Project ID +- Consumer Pub/Sub Subscription ID (created beforehand) +- Producer GCP Project ID + +### Deployment + +The script can be found [here](/referenced-scripts/deployment-scripts/schemaregistry/#schema-registry-validator-pubsub). 
To run the script, run the +following command: + +```bash +# "dataphos" is an example of the namespace name +# "valid-topic" is example of the valid topic name +# "dead-letter-topic" is example of the dead-letter topic name +# "json" is example of the message format name (needs to be either "json", "avro", "csv", "xml", "protobuf") +# "dataphos-project" is example of the consumer GCP project ID +# "input-topic-sub" is example of the input topic subcription name +# "dataphos-project" is example of the producer GCP project ID + +./validator-pubsub.sh "dataphos" "valid-topic" "dead-letter-topic" "json" "dataphos-project" "input-topic-sub" "dataphos-project" +``` + +{{< /tab >}} {{< tab "Azure (Service Bus)" >}} + +### Arguments + +Required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer ServiceBus valid topic ID +- Producer ServiceBus dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer ServiceBus Connection String +- Consumer ServiceBus Topic +- Consumer ServiceBus Subscription +- Producer ServiceBus Connection String + +### Deployment + +The script can be found [here](/referenced-scripts/deployment-scripts/schemaregistry/#schema-registry-validator-servicebus) To run the script, run the +following command: + +```bash +# "dataphos" is an example of the namespace name +# "valid-topic" is example of the valid topic name +# "dead-letter-topic" is example of the dead-letter topic name +# "json" is example of the message format name (needs to be either "json", "avro", "csv", "xml", "protobuf") +# "Endpoint=sb://foo.servicebus.windows.net/;SharedAccessKeyName=someKeyName;SharedAccessKey=someKeyValue" is example of the consumer ServiceBus connection string (https://azurelessons.com/azure-service-bus-connection-string/) +# "input-topic" is example of the input topic name +# "input-topic-sub" is example of the input topic subcription name +# "Endpoint=sb://foo.servicebus.windows.net/;SharedAccessKeyName=someKeyName;SharedAccessKey=someKeyValue" is example of the producer ServiceBus connection string (https://azurelessons.com/azure-service-bus-connection-string/) + +./validator-servicebus.sh "dataphos" "valid-topic" "dead-letter-topic" "json" "Endpoint=sb://foo.servicebus.windows.net/;SharedAccessKeyName=someKeyName;SharedAccessKey=someKeyValue" "input-topic" "input-topic-sub" "Endpoint=sb://foo.servicebus.windows.net/;SharedAccessKeyName=someKeyName;SharedAccessKey=someKeyValue" +``` + +{{< /tab >}} + +{{< tab "Kafka" >}} + +### Arguments + +Required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer Kafka valid topic ID +- Producer Kafka dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer Kafka broker address +- Consumer Kafka Topic +- Consumer Kafka Group ID +- Producer Kafka broker address + +### Deployment + +The script can be found [here](/referenced-scripts/deployment-scripts/schemaregistry/#schema-registry-validator-kafka) To run the script, run the +following command: + +```bash +# "dataphos" is an example of the namespace name +# "valid-topic" is example of the valid topic name +# "dead-letter-topic" is example of the dead-letter topic name +# "json" is example of the message format name (needs to be either "json", "avro", "csv", "xml", "protobuf") +# "127.0.0.1:9092" is example of the consumer Kafka broker address +# "input-topic" is example of the input topic name +# 
"group01" is example of the input topic group ID +# "127.0.0.1:9092" is example of the producer Kafka broker address + +./validator-kafka.sh "dataphos" "valid-topic" "dead-letter-topic" "json" "127.0.0.1:9092" "input-topic" "group01" "127.0.0.1:9092" +``` + +{{< /tab >}} + +{{< tab "Kafka to Pub/Sub (Consumer Kafka, producer GCP Pub/Sub)" >}} + +### Arguments + +Required arguments are: + +- The Kubernetes namespaces to deploy the validator component to +- Producer Kafka valid topic ID +- Producer Kafka dead-letter topic ID +- Expected message format validated by this validator (json, avro, protobuf, csv, xml) +- Consumer Kafka Connection String +- Consumer Kafka Topic +- Consumer Kafka Subscription +- Producer GCP Project ID + +### Deployment + +The script can be found [here](/referenced-scripts/deployment-scripts/schemaregistry/#schema-registry-validator-kafka-to-pubsub) To run the script, run the +following command: + +```bash +# "dataphos" is an example of the namespace name +# "valid-topic" is example of the valid topic name +# "dead-letter-topic" is example of the dead-letter topic name +# "json" is example of the message format name (needs to be either "json", "avro", "csv", "xml", "protobuf") +# "127.0.0.1:9092" is example of the consumer Kafka broker address +# "input-topic" is example of the input topic name +# "group01" is example of the input topic group ID +# "dataphos-project" is example of the producer GCP project ID + +./validator-kafka-to-pubsub.sh "dataphos" "valid-topic" "dead-letter-topic" "json" "" "input-topic" "group01" "dataphos-project" +``` + +{{< /tab >}} + +{{< /tabs >}} diff --git a/dataphos-docs/content/schema_registry/usage.md b/dataphos-docs/content/schema_registry/usage.md new file mode 100644 index 0000000..91ddb75 --- /dev/null +++ b/dataphos-docs/content/schema_registry/usage.md @@ -0,0 +1,294 @@ +--- +title: "Usage" +draft: false +weight: 2 +--- + +# The Schema Registry REST API + +Even thought the Schema Registry provides REST API for registering, updating, fetching a schema, fetching all the +versions, fetching the latest, deleting a schema, etc. We will showcase here only the requests to register, update and +fetch a schema. + +{{< tabs "Schema Registry REST API" >}} {{< tab "Register a schema" >}} + +## Register a schema + +After the Schema Registry is deployed you will have access to its API endpoint. To register a schema, you have to send a +POST request to the endpoint ```http://schema-registry-svc:8080/schemas``` in whose body you need to provide the name of the +schema, description, schema_type, specification (the schema), compatibility and validity mode. + +The compatibility type determines how the Schema Registry compares the new schema with previous versions of a schema, +for a given subject. The Dataphos Schema Registry default compatibility type is ```BACKWARD```. All the compatibility +types are described in more detail in the sections below. + +| Compatibility Type | Changes allowed | Check against which schemas | Upgrade first | Description | +|---------------------|-----------------------------------------------|---------------------------------|---------------|------------------------------------------------------------------------------------------------| +| BACKWARD | Delete fields
Add optional fields | Last version | Consumers | Being able to understand messages from the last schema and the current schema. |
+| BACKWARD_TRANSITIVE | Delete fields<br>Add optional fields | All previous versions | Consumers | Being able to understand messages from all the previous schema versions and the current schema. |
+| FORWARD | Add fields<br>Delete optional fields | Last version | Producers | Being able to understand messages from the current schema and the next schema. |
+| FORWARD_TRANSITIVE | Add fields<br>Delete optional fields | All previous versions | Producers | Being able to understand messages from the current schema and all the next schema versions. |
+| FULL | Add optional fields<br>Delete optional fields | Last version | Any order | Being both backward and forward compatible. |
+| FULL_TRANSITIVE | Add optional fields<br>
Delete optional fields | All previous versions | Any order | Being both backward_transitive and forward_transitive compatible. | +| NONE | All changes are accepted | Compatibility checking disabled | Depends | All changes in the messages are acceptible. | + + +The validity type determines how strict the Schema Registry will be when registering a schema. Meaning, will it demand +that the schema is compliant with the rules of the data format or with the schema rules. +The Dataphos Schema Registry default validity type is ```FULL```. Possible values for the validity mode are: ```FULL```, +```NONE```, ```SYNTAX_ONLY```. + +``` +{ + "description": "new json schema for testing", + "schema_type": "json", + "specification": "{\r\n \"$id\": \"https://example.com/person.schema.json\",\r\n \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\r\n \"title\": \"Person\",\r\n \"type\": \"object\",\r\n \"properties\": {\r\n \"firstName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's first name.\"\r\n },\r\n \"lastName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's last name.\"\r\n },\r\n \"age\": {\r\n \"description\": \"Age in years which must be equal to or greater than zero.\",\r\n \"type\": \"integer\",\r\n \"minimum\": 0\r\n }\r\n }\r\n}\r\n", + "name": "schema json", + "compatibility_mode": "BACKWARD", + "validity_mode": "FULL" +} +``` + +Using curl: + +``` bash +curl -XPOST -H "Content-type: application/json" -d '{ + "description": "new json schema for testing", + "schema_type": "json", + "specification": "{\r\n \"$id\": \"https://example.com/person.schema.json\",\r\n \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\r\n \"title\": \"Person\",\r\n \"type\": \"object\",\r\n \"properties\": {\r\n \"firstName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's first name.\"\r\n },\r\n \"lastName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's last name.\"\r\n },\r\n \"age\": {\r\n \"description\": \"Age in years which must be equal to or greater than zero.\",\r\n \"type\": \"integer\",\r\n \"minimum\": 0\r\n }\r\n }\r\n}\r\n", + "name": "schema json", + "compatibility_mode": "BACKWARD", + "validity_mode": "FULL" +}' 'http://schema-registry-svc:8080/schemas/' +``` + +The response to the schema registration request will be: + +- STATUS 201 Created + ```json + { + "identification": "32", + "version": "1", + "message": "schema successfully created" + } + ``` + +- STATUS 409 Conflict -> indicating that the schema already exists + ```json + { + "identification": "32", + "version": "1", + "message": "schema already exists at id=32" + } + ``` + +- STATUS 500 Internal Server Error -> indicating a server error, which means that either the request is not correct ( + missing fields) or that the server is down. + ```json + { + "message": "Internal Server Error" + } + ``` + +{{< /tab >}} {{< tab "Update a schema" >}} + +## Update a schema + +After the Schema Registry is registered you can update it by registering a new version under that schema ID. 
To update a +schema, you have to send a PUT request to the endpoint ```http://schema-registry-svc:8080/schemas/``` in whose body +you need to provide the description (optional) of the version and the specification (the schema) + +```json +{ + "description": "added field for middle name", + "specification": "{\r\n \"$id\": \"https://example.com/person.schema.json\",\r\n \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\r\n \"title\": \"Person\",\r\n \"type\": \"object\",\r\n \"properties\": {\r\n \"firstName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's first name.\"\r\n },\r\n \"lastName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's last name.\"\r\n },\r\n \"lastName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's last name.\"\r\n },\r\n \"age\": {\r\n \"description\": \"Age in years which must be equal to or greater than zero.\",\r\n \"type\": \"integer\",\r\n \"minimum\": 0\r\n }\r\n }\r\n}\r\n" +} +``` + +Using curl: + +```bash +curl -XPUT -H "Content-type: application/json" -d '{ + "description": "added field for middle name", + "specification": "{\r\n \"$id\": \"https://example.com/person.schema.json\",\r\n \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\r\n \"title\": \"Person\",\r\n \"type\": \"object\",\r\n \"properties\": {\r\n \"firstName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's first name.\"\r\n },\r\n \"lastName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's last name.\"\r\n },\r\n \"lastName\": {\r\n \"type\": \"string\",\r\n \"description\": \"The person's last name.\"\r\n },\r\n \"age\": {\r\n \"description\": \"Age in years which must be equal to or greater than zero.\",\r\n \"type\": \"integer\",\r\n \"minimum\": 0\r\n }\r\n }\r\n}\r\n" +}' 'http://schema-registry-svc:8080/schemas/' +``` + +The response to the schema updating request will be the same as for registering except when the updating is done +successfully it will be status 200 OK and a new version will be provided. 
+ +```json +{ + "identification": "32", + "version": "2", + "message": "schema successfully updated" +} +``` + +{{< /tab >}} {{< tab "Fetch a schema version" >}} + +## Fetch a schema version + +To get a schema version and its relevant details, a GET request needs to be made and the endpoint needs to be: + +``` +http://schema-registry-svc:8080/schemas//versions/ +``` + +Using curl: + +```bash +curl -XGET -H "Content-type: application/json" 'http://schema-registry-svc:8080/schemas//versions/' +``` + +The response to the schema registration request will be: + +- STATUS 200 OK + ```json + { + "id": "32", + "version": "1", + "schema_id": "32", + "specification": "ew0KICAiJHNjaGVtYSI6ICJodHRwOi8vanNvbi1zY2hlbWEub3JnL2RyYWZ0LTA3L3NjaGVtYSIsDQogICJ0eXBlIjogIm9iamVjdCIsDQogICJ0aXRsZSI6ICJUaGUgUm9vdCBTY2hlbWEiLA0KICAiZGVzY3JpcHRpb24iOiAiVGhlIHJvb3Qgc2NoZW1hIGNvbXByaXNlcyB0aGUgZW50aXJlIEpTT04gZG9jdW1lbnQuIiwNCiAgImRlZmF1bHQiOiB7fSwNCiAgImFkZGl0aW9uYWxQcm9wZXJ0aWVzIjogdHJ1ZSwNCiAgInJlcXVpcmVkIjogWw0KICAgICJwaG9uZSINCiAgXSwNCiAgInByb3BlcnRpZXMiOiB7DQogICAgInBob25lIjogew0KICAgICAgInR5cGUiOiAiaW50ZWdlciIsDQogICAgICAidGl0bGUiOiAiVGhlIFBob25lIFNjaGVtYSIsDQogICAgICAiZGVzY3JpcHRpb24iOiAiQW4gZXhwbGFuYXRpb24gYWJvdXQgdGhlIHB1cnBvc2Ugb2YgdGhpcyBpbnN0YW5jZS4iLA0KICAgICAgImRlZmF1bHQiOiAiIiwNCiAgICAgICJleGFtcGxlcyI6IFsNCiAgICAgICAgMQ0KICAgICAgXQ0KICAgIH0sDQogICAgInJvb20iOiB7DQogICAgICAidHlwZSI6ICJpbnRlZ2VyIiwNCiAgICAgICJ0aXRsZSI6ICJUaGUgUm9vbSBTY2hlbWEiLA0KICAgICAgImRlc2NyaXB0aW9uIjogIkFuIGV4cGxhbmF0aW9uIGFib3V0IHRoZSBwdXJwb3NlIG9mIHRoaXMgaW5zdGFuY2UuIiwNCiAgICAgICJkZWZhdWx0IjogIiIsDQogICAgICAiZXhhbXBsZXMiOiBbDQogICAgICAgIDEyMw0KICAgICAgXQ0KICAgIH0NCiAgfQ0KfQ==", + "description": "new json schema for testing", + "schema_hash": "72966008fdcec8627a0e43c5d9a247501fc4ab45687dd2929aebf8ef3eb06ccd", + "created_at": "2023-05-09T08:38:54.5515Z", + "autogenerated": false + } + ``` +- STATUS 404 Not Found -> indicating that the wrong schema ID or schema version was provided +- STATUS 500 Internal Server Error -> indicating a server error, which means that either the request is not correct ( + wrong endpoint) or that the server is down. + +{{< /tab >}} {{< tab "Other requests" >}} + +## Other requests + +| Description | Method | URL | Headers | Body | +|:-------------------------------------------------:|--------|:-----------------------------------------------------------------:|:-----------------------------------:|:---------------------------------:| +| Get all the schemas | GET | http://schema-registry-svc/schemas | Content-Type: application/json | This request does not have a body | +| Get all the schema versions of the specified ID | GET | http://schema-registry-svc/schemas/{id}/versions | Content-Type: application/json | This request does not have a body | +| Get the latest schema version of the specified ID | GET | http://schema-registry-svc/schemas/{id}/versions/latest | Content-Type: application/json | This request does not have a body | +| Get schema specification by id and version | GET | http://schema-registry-svc/schemas/{id}/versions/{version}/spec | Content-Type: application/json
| This request does not have a body | +| Delete the schema under the ID | DELETE | http://schema-registry-svc/schemas/{id} | Content-Type: application/json | This request does not have a body | +| Delete the schema by id and version | DELETE | http://schema-registry-svc/schemas/{id}/versions/{version} | Content-Type: application/json | This request does not have a body | + + +## Schema search + +Aside from fetching schemas by their ID and version, they can also be fetched using search endpoint. Schemas can be +searched on the "/schemas/search?" endpoint, following the search condition. There can be multiple criteria for the +search, and they are in the following format: *par1=val1&par2=val2&par3=val3*. +The parameters that schema can be searched upon are as follows: +- id +- version +- type +- name +- attributes + +Additionally, they can be ordered by these parameters (except for attributes) in ascending/descending order (the default +parameter to order by is ID), as well as limited to a certain number of items. The table below shows a few example of +schema search requests (none have body). + +| Description | Method | URL | Headers | +|:------------------------------------------------------------------------------------------:|--------|:---------------------------------------------------------------------------------------:|:-----------------------------------:| +| Get all schemas that contain *schema_name* in their name (case sensitive) | GET | http://schema-registry-svc/schemas/search?name=schema_name | Content-Type: application/json | +| Get all schemas that contain *schema_name* in their name and type json | GET | http://schema-registry-svc/schemas/search?name=schema_name&type=json | Content-Type: application/json | +| Get a schema with an *ID* | GET | http://schema-registry-svc/schemas/search?id=ID | Content-Type: application/json | +| Get a schema with an *ID* in descending order | GET | http://schema-registry-svc/schemas/search?id=ID&sort=desc | Content-Type: application/json | +| Get up to 50 schemas whose name *schema_name* in ascending order in respect to their names | GET | http://schema-registry-svc/schemas/search?id=schema_name&orderBy=name&sort=asc&limit=50 | Content-Type: application/json | +| Get a schema whose name contains *schema_name* and attributes *attr1* and *attr2* | GET | http://schema-registry-svc/schemas/search?name=schema_name&attributes=attr1,attr2 | Content-Type: application/json | + + +{{< /tab >}} {{< /tabs >}} + +# Validator message format + +Depending on the technology your producer uses, the way you shape the message may differ and therefore the part of the +message that contains the metadata might be called ```attributes```, ```metadata,``` etc. + +Besides the data field, which contains the message data, inside the attributes (or metadata) structure it's important to +add fields ```schemaId```, ```versionId``` and ```format``` +which are important information for the validator component. In case some additional attributes are provided, the validator +won't lose them, they will be delegated to the destination topic. + +{{< tabs "Schema Registry - validator message format" >}} {{< tab "Pub/Sub" >}} + +```json +{ + "ID": "string", + "Data": "string", + "Attributes": { + "schemaId": "string", + "versionId": "string", + "format": "string", + // ... 
+ }, + "PublishTime": "time", +} +``` + +| Field | Description | +|------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Data | **string** (bytes format)

The message data field. If this field is empty, the message must contain at least one attribute.<br><br>A base64-encoded string. |
+| Attributes | **map** (key: string, value: string)<br><br>Attributes for this message. If this field is empty, the message must contain non-empty data. This can be used to filter messages on the subscription.<br><br>An object containing a list of "key": value pairs. Example: { "schemaId": "1", "versionId": "2", "format": "json" }. |
+| PublishTime | **time** (time.Time format)<br><br>PublishTime is the time at which the message was published. This is populated by the server for Messages obtained from a subscription. |
+
+{{< /tab >}}
+
+{{< tab "ServiceBus" >}}
+
+```json
+{
+  "MessageID": "string",
+  "Body": "string",
+  "PartitionKey": "string",
+  "ApplicationProperties": {
+    "schemaId": "string",
+    "versionId": "string",
+    "format": "string",
+    // ...
+  },
+  "EnqueuedTime": "time"
+}
+```
+
+| Field | Description |
+|------------|-------------|
+| Body | **string** (bytes format)<br><br>The message data field. If this field is empty, the message must contain at least one application property. |
+| ApplicationProperties | **map** (key: string, value: string)<br><br>Attributes for this message. ApplicationProperties can be used to store custom metadata for a message.<br><br>An object containing a list of "key": value pairs. Example: { "schemaId": "1", "versionId": "2", "format": "json" }. |
+| PartitionKey | **string**<br><br>PartitionKey is used with a partitioned entity and enables assigning related messages to the same internal partition. This ensures that the submission sequence order is correctly recorded. The partition is chosen by a hash function in Service Bus and cannot be chosen directly. |
+| EnqueuedTime | **time** (time.Time format)<br><br>EnqueuedTime is the UTC time when the message was accepted and stored by Service Bus. |
+
+{{< /tab >}}
+
+{{< tab "Kafka" >}}
+
+```json
+{
+  "Key": "string",
+  "Value": "string",
+  "Offset": "int64",
+  "Partition": "int32",
+  "Headers": {
+    "schemaId": "string",
+    "versionId": "string",
+    "format": "string",
+    // ...
+  },
+  "Timestamp": "time"
+}
+```
+
+| Field | Description |
+|------------|-------------|
+| Key | **string** (bytes format)<br><br>Key is an optional field that can be used for partition assignment. |
+| Value | **string** (bytes format)<br><br>Value is the blob of data to write to Kafka. |
+| Offset | **int64**<br><br>Offset is the offset that a record is written as. |
+| Partition | **int32**<br><br>Partition is the partition that a record is written to. |
+| Headers | **map** (key: string, value: string)<br><br>Headers are optional key/value pairs that are passed along with records.<br><br>Example: { "schemaId": "1", "versionId": "2", "format": "json" }.<br><br>These are purely for producers and consumers; Kafka does not look at this field and only writes it to disk. |
+| Timestamp | **time** (time.Time format)<br><br>
Timestamp is the timestamp that will be used for this record. Record batches are always written with "CreateTime", meaning that timestamps are generated by clients rather than brokers.| + +{{< /tab >}} +{{< /tabs >}} + diff --git a/dataphos-docs/content/schema_registry/videos-and-blogs.md b/dataphos-docs/content/schema_registry/videos-and-blogs.md new file mode 100644 index 0000000..4b066d2 --- /dev/null +++ b/dataphos-docs/content/schema_registry/videos-and-blogs.md @@ -0,0 +1,19 @@ +--- +title: "Videos and Blogs" +draft: false +weight: 5 +--- +## Blogs +[Schema Registry business blog](https://www.syntio.net/en/labs-musings/building-a-data-driven-culture-with-dataphos-schema-registry?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) + +[Schema Registry technical blog](https://www.syntio.net/en/labs-musings/demystifying-the-dataphos-schema-registry-a-technical-deep-dive?utm_source=DataphosDocs&utm_medium=Documentation&utm_campaign=DataphosDocs) + + +## Videos +Schema Registry Overview and Demo + +[![Schema Registry - A Data Platform component by Syntio - Showcase](/sr_thumbnail_overview.jpg)](https://youtu.be/X9JH3MyCM0E "Schema Registry - A Data Platform component by Syntio - Showcase") + +Schema Registry Deployment Guide + +[![Schema Registry - A Data Platform component by Syntio - Deployment Guide](/sr_thumbnail_deployment.jpg)](https://youtu.be/XW3K5riJBEE "Schema Registry - A Data Platform component by Syntio - Deployment Guide") diff --git a/dataphos-docs/content/schema_registry/what-is-schema-registry.md b/dataphos-docs/content/schema_registry/what-is-schema-registry.md new file mode 100644 index 0000000..98cfb3c --- /dev/null +++ b/dataphos-docs/content/schema_registry/what-is-schema-registry.md @@ -0,0 +1,263 @@ +--- +title: "Overview" +draft: false +weight: 1 +--- + +![](/sr.png) + +**Dataphos Schema Registry** is a cloud-based schema management and message validation system. + +Schema management consists of schema registration and versioning, and message validation consists of validators that validate messages for +the given message schema. Its core components are a server with HTTP RESTful interface used to manage the schemas, and +lightweight message validators, which verify the schema compatibility and validity of the incoming messages. + +The system allows developers to define and manage standard schemas for events, sharing them across the organization and safely +evolving them with the preservation of compatibility as well as validating events with a given event schema. +The Schema Registry component stores a versioned history of all schemas and provides a RESTful interface for working with them. + +## What is a schema? + +In the context of a schema registry, a schema is a formal definition of the structure and data types for a particular +data format. The schema defines the rules that govern how data is represented and what values are allowed for each +field. + +For example, if you have a dataset that contains customer information, you might define a schema for that dataset that +specifies the fields that must be present (e.g. name, address, phone number), the data types for each field +(e.g. string, integer, date), and any constraints or rules that apply to the data (e.g. phone numbers must be in a +particular format). + +The schema itself is typically defined using a specific schema language, such as Avro, JSON Schema, Protobuf, etc. 
The +schema language provides a standardized syntax for defining the schema, which can be used by different systems to ensure +that they're interpreting the schema correctly. + +### Schema Examples + +**Example 1** + +{{< tabs "Schema Example 1" >}} {{< tab "Message" >}} +```json +{ + "firstName": "John", + "lastName": "Doe", + "age": 21 +} + +``` +{{< /tab >}} +{{< tab "Schema" >}} +```json +{ + "$id": "https://example.com/person.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Person", + "type": "object", + "properties": { + "firstName": { + "type": "string", + "description": "The person's first name." + }, + "lastName": { + "type": "string", + "description": "The person's last name." + }, + "age": { + "description": "Age in years which must be equal to or greater than zero.", + "type": "integer", + "minimum": 0 + } + } +} + +``` +{{< /tab >}} +{{< /tabs>}} + +**Example 2** +{{< tabs "Schema Example 2" >}} {{< tab "Message" >}} +```json +{ + "id": 7, + "name": "John Doe", + "age": 22, + "hobbies": { + "indoor": [ + "Chess" + ], + "outdoor": [ + "BasketballStand-up Comedy" + ] + } +} +``` +{{< /tab >}} +{{< tab "Schema" >}} +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://example.com/employee.schema.json", + "title": "Record of employee", + "description": "This document records the details of an employee", + "type": "object", + "properties": { + "id": { + "description": "A unique identifier for an employee", + "type": "number" + }, + "name": { + "description": "Full name of the employee", + "type": "string" + }, + "age": { + "description": "Age of the employee", + "type": "number" + }, + "hobbies": { + "description": "Hobbies of the employee", + "type": "object", + "properties": { + "indoor": { + "type": "array", + "items": { + "description": "List of indoor hobbies", + "type": "string" + } + }, + "outdoor": { + "type": "array", + "items": { + "description": "List of outdoor hobbies", + "type": "string" + } + } + } + } + } +} + + +``` +{{< /tab >}} +{{< /tabs>}} + +## What is a Schema Registry? + +A schema registry is typically used in distributed data architectures where data is produced by one system and consumed +by multiple other systems. Here's an example of how a schema registry might be used in practice: + +Suppose you have a streaming data pipeline that ingests data from multiple sources, processes the data in real-time, and +then outputs the processed data to multiple downstream systems for further analysis. Each source system produces data in +a different format, and each downstream system consumes data in a different format. In order to ensure that the data +flowing through the pipeline is well-formed and compatible with all of the downstream systems, you might use a schema +registry to manage the schemas for the data. + +How it works: + +- The source systems creates a schema (either automated or manually) and registers it in the Schema Registry from which + they receive an ID and Version. + +- The source systems produce data in a particular format, such as Avro, JSON, ProtoBuf, CSV or XML. Before producing + data, they insert the ID and Version received in the previous step in the message metadata. + +- When the data is ingested by the streaming pipeline, the data is validated against the schema definition to ensure + that it conforms to the expected structure and data types. 
+ +- Depending on the validation result, the data will be either sent to the valid topic, where the consumers are subscribed + to, or to the dead-letter topic, where the invalid data will reside and wait for manual inspection. + +By using a schema registry to manage the schemas for the data, you can ensure that the data flowing through the pipeline +is well-formed and compatible with all the downstream systems, reducing the likelihood of data quality issues and +system failures. Additionally, by providing a central location for schema definitions, you can improve collaboration and +communication between teams working with the same data. + +### Use cases + +Some schema registry use cases: + +- Data validation and governance: A schema registry can ensure that the data being produced and consumed by different + systems conform to a specified schema. This helps ensure data quality and consistency across the organization, which + is particularly important in regulated industries. + +- Compatibility checking: As systems evolve over time, it's important to ensure that changes to data schemas are + compatible with existing systems that consume the data. A schema registry can help detect incompatibilities early on + and prevent costly failures downstream. + +- Data discovery: A schema registry can be used to help data consumers discover and understand the structure of data + available in the organization. By providing a central location for data schema definitions, a schema registry makes it + easier for data analysts and engineers to find and understand the data they need. + +- Automation: A schema registry can be integrated with other data tools and processes to automate schema-related tasks, + such as schema validation, schema evolution, and data transformation. + +- Collaboration: By providing a shared location for schema definitions, a schema registry can facilitate collaboration + between different teams and departments working with the same data. This can help reduce duplication of effort and + improve communication between teams. + +## Schema Registry Components + +The Schema Registry system consists of the following two components: **Registry** and **Validators**. + +## Registry + +The Registry, which itself is a database with a REST API on top, is deployed as a deployment on a Kubernetes cluster +which performs the following: + +- Schema registration +- Schema updating (adding a new version of an existing schema) +- Retrieval of existing schemas (specified version or latest version) +- Deleting the whole schema or just specified versions of a schema +- Checking for schema validity (syntactically and semantically) +- Checking for schema compatibility (backward, forward, transitive) + +The main component of the Schema Registry product is entirely independent of the implementation of the data-streaming +platform. It is implemented as a REST API that provides handles (via URL) for clients and communicates via HTTP +requests. + +The validator component communicates with the REST API by sending the HTTP GET request that retrieves a message schema from +the Registry by using the necessary parameters. The message schemas themselves can be stored in any type of database ( +Schema History), whether in tables like in standard SQL databases, such as Oracle or PostgreSQL, or NoSQL databases like +MongoDB. The component itself has an interface with the database connector that can be easily modified to +work with databases that fit the client’s needs. 
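+As an illustration of this lookup, the call the Validator makes is a plain HTTP GET against the Registry's REST API; the service name, port and path below mirror the examples shown in the Usage section of this documentation:
+
+```bash
+# Fetch version 1 of the schema with ID 32 from the Registry (ID and version values are illustrative).
+curl -X GET -H "Content-Type: application/json" 'http://schema-registry-svc:8080/schemas/32/versions/1'
+```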
+ +## Validator + +The Validator is deployed as a deployment on a Kubernetes cluster and performs the following: + +- Message schema retrieval (and caching) from the Registry using message metadata +- Input message validation using the retrieved schema +- Input message transmission depending on its validation result + +Before the producer starts sending messages their schema needs to be registered in the database, whether it is an +entirely new schema or a new version of an existing one. Each of the messages being sent to the input topic needs to +have its metadata enriched with the schema information, which includes the ID, version and the message format. + +The role of the Validator component is to filter the messages being pushed from the input topic based on the metadata +attributes and route them to their destination. It does so with the help of the Registry component. + +If the schema is registered in the database, the request sent to the Registry will return the schema specification and +the message can be successfully validated and routed to a topic for valid messages. In case of validation failure, the +message will be routed to a topic for dead letter messages. + +Message brokers supported with the Validator component are: + +- GCP Pub/Sub +- Azure ServiceBus +- Azure Event Hubs +- Apache Kafka +- Apache Pulsar +- NATS JetSteam + +Also, the Schema registry enables the use of different protocols for producers and consumers, which ultimately enables +protocol conversion. For example, using the Schema registry protocol conversion you will be able to have a producer that +publishes messages using the Kafka protocol and a consumer that consumes messages using Pub/Sub protocol. + +Providing a data schema and data the validators can determine if the given data is valid for the given schema. Data +types supported are: + +- JSON +- AVRO +- Protocol Buffers +- XML +- CSV + +Instead of logging metrics to standard output, the Validator component has Prometheus support for monitoring and alerting. diff --git a/dataphos-docs/layouts/partials/head/custom.html b/dataphos-docs/layouts/partials/head/custom.html new file mode 100644 index 0000000..3f63aa0 --- /dev/null +++ b/dataphos-docs/layouts/partials/head/custom.html @@ -0,0 +1,2 @@ + +{{ template "_internal/google_analytics.html" . }} \ No newline at end of file diff --git a/dataphos-docs/layouts/partials/site-footer.html b/dataphos-docs/layouts/partials/site-footer.html new file mode 100644 index 0000000..dca1f5c --- /dev/null +++ b/dataphos-docs/layouts/partials/site-footer.html @@ -0,0 +1,46 @@ + + \ No newline at end of file diff --git a/dataphos-docs/layouts/partials/site-header.html b/dataphos-docs/layouts/partials/site-header.html new file mode 100644 index 0000000..9eb0781 --- /dev/null +++ b/dataphos-docs/layouts/partials/site-header.html @@ -0,0 +1,79 @@ +
+
+ {{ if .MenuEnabled }} + + {{ end }} + +
+ + {{ if .Root.Site.Data.menu.extra.header }} + {{ partial "menu-extra" (dict "current" .Root "source" .Root.Site.Data.menu.extra.header "target" "header") }} + {{ end }} + + + + + {{ i18n "button_toggle_dark" }} + + + + {{ i18n "button_toggle_dark" }} + + + + {{ i18n "button_toggle_dark" }} + + + + + + + + {{ i18n "button_homepage" }} + + + + + + {{ partial "language" .Root }} + + + + + + + +
+
+
+ \ No newline at end of file diff --git a/dataphos-docs/layouts/shortcodes/details.html b/dataphos-docs/layouts/shortcodes/details.html new file mode 100644 index 0000000..b6054f9 --- /dev/null +++ b/dataphos-docs/layouts/shortcodes/details.html @@ -0,0 +1,4 @@ +
+ {{ (.Get 0) | markdownify }} + {{ .Inner | markdownify }} +
\ No newline at end of file diff --git a/dataphos-docs/layouts/shortcodes/rawhtml.html b/dataphos-docs/layouts/shortcodes/rawhtml.html new file mode 100644 index 0000000..520ec17 --- /dev/null +++ b/dataphos-docs/layouts/shortcodes/rawhtml.html @@ -0,0 +1,2 @@ + +{{.Inner}} \ No newline at end of file diff --git a/dataphos-docs/static/arch.png b/dataphos-docs/static/arch.png new file mode 100644 index 0000000..4568536 Binary files /dev/null and b/dataphos-docs/static/arch.png differ diff --git a/dataphos-docs/static/brand.png b/dataphos-docs/static/brand.png new file mode 100644 index 0000000..4dbb14c Binary files /dev/null and b/dataphos-docs/static/brand.png differ diff --git a/dataphos-docs/static/custom.css b/dataphos-docs/static/custom.css new file mode 100644 index 0000000..133104f --- /dev/null +++ b/dataphos-docs/static/custom.css @@ -0,0 +1,972 @@ +/* Global customization */ +:root { + --code-max-height: 60rem; + } + + /* Light mode theming */ + :root, + :root[color-theme="light"] { + --header-background: #1a1a1a; + --header-font-color: #ffffff; + + --body-background: #1a1a1a; + --body-font-color: #ced3d8; + + --mark-color: #ffab00; + + --button-background: #be69d8; + --button-border-color: #be69d8; + + --link-color: #be69d8; + --link-color-visited: #c27a9e; + + --code-background: #3c393d; + --code-accent-color: #be69d8; + --code-accent-color-lite: #d896ec; + --code-font-color: #eeeded; + + --code-copy-background: #343a40; + --code-copy-font-color: #6b7784; + --code-copy-border-color: #6b7784; + --code-copy-success-color: #be69d8; + + --accent-color: #2b3035; + --accent-color-lite: #2f353a; + + --control-icons: #b2bac1; + + --footer-background: #1a1a1a; + --footer-font-color: #ffffff; + --footer-link-color: #be69d8; + --footer-link-color-visited: #be69d8; + + } + + +:root .dark-mode-dim .gdoc-markdown img { + filter: brightness(0.75) grayscale(0.2) +} + +:root .gdoc-markdown .gdoc-hint, +:root .gdoc-markdown .gdoc-props__tag, +:root .gdoc-markdown .admonitionblock { + filter: saturate(2.5) brightness(0.85) +} + +:root .gdoc-markdown .gdoc-hint a, +:root .gdoc-markdown .admonitionblock a { + color: var(--hint-link-color) +} + +:root .gdoc-markdown .gdoc-hint a:visited, +:root .gdoc-markdown .admonitionblock a:visited { + color: var(--hint-link-color-visited) +} + +:root .gdoc-markdown .gdoc-hint__title, +:root .gdoc-markdown .admonitionblock table td:first-child { + background-color: rgba(134, 142, 150, .15) +} + +:root .chroma { + color: var(--code-font-color) +} + +:root .chroma .lntable td:nth-child(2) code .hl { + width: auto; + margin-left: -0.5em; + padding: 0 .5em +} + +:root .highlight pre.chroma { + width: 100%; + overflow: auto; + max-height: var(--code-max-height) +} + +:root .chroma .lntable { + border: 1px solid var(--code-accent-color); + border-radius: .15rem; + border-spacing: 0; + padding: 0; + margin: 0; + width: 100%; + display: block; + max-height: var(--code-max-height); + overflow: auto +} + +:root .chroma .lntable pre.chroma { + max-height: none; + border-radius: 0; + margin: 0 +} + +:root .chroma .lntable td:first-child code { + background-color: var(--code-accent-color-lite); + border-right: 1px solid var(--code-accent-color); + padding-left: 0; + padding-right: 0; + border-radius: 0 +} + +:root .chroma .lntable td:nth-child(2) { + width: 100%; + margin-left: 2rem +} + +:root .chroma .x { + color: inherit +} + +:root .chroma .err { + color: inherit +} + +:root .chroma .lntd { + vertical-align: top; + padding: 0; + margin: 0; + border: 0 +} + +:root .chroma 
.hl { + display: block; + width: 100%; + background-color: #4f1605 +} + +:root .chroma .lnt { + padding: 0 .8em +} + +:root .chroma .ln { + margin-right: .4em; + padding: 0 .4em 0 .4em; + color: #b3b3b3 +} + +:root .chroma .k { + color: #ff79c6 +} + +:root .chroma .kc { + color: #ff79c6 +} + +:root .chroma .kd { + color: #8be9fd; + font-style: italic +} + +:root .chroma .kn { + color: #ff79c6 +} + +:root .chroma .kp { + color: #ff79c6 +} + +:root .chroma .kr { + color: #ff79c6 +} + +:root .chroma .kt { + color: #8be9fd +} + +:root .chroma .n { + color: inherit +} + +:root .chroma .na { + color: #50fa7b +} + +:root .chroma .nb { + color: #8be9fd; + font-style: italic +} + +:root .chroma .bp { + color: inherit +} + +:root .chroma .nc { + color: #50fa7b +} + +:root .chroma .no { + color: inherit +} + +:root .chroma .nd { + color: inherit +} + +:root .chroma .ni { + color: inherit +} + +:root .chroma .ne { + color: inherit +} + +:root .chroma .nf { + color: #50fa7b +} + +:root .chroma .fm { + color: inherit +} + +:root .chroma .nl { + color: #8be9fd; + font-style: italic +} + +:root .chroma .nn { + color: inherit +} + +:root .chroma .nx { + color: inherit +} + +:root .chroma .py { + color: inherit +} + +:root .chroma .nt { + color: #ff79c6 +} + +:root .chroma .nv { + color: #8be9fd; + font-style: italic +} + +:root .chroma .vc { + color: #8be9fd; + font-style: italic +} + +:root .chroma .vg { + color: #8be9fd; + font-style: italic +} + +:root .chroma .vi { + color: #8be9fd; + font-style: italic +} + +:root .chroma .vm { + color: inherit +} + +:root .chroma .l { + color: inherit +} + +:root .chroma .ld { + color: inherit +} + +:root .chroma .s { + color: #f1fa8c +} + +:root .chroma .sa { + color: #f1fa8c +} + +:root .chroma .sb { + color: #f1fa8c +} + +:root .chroma .sc { + color: #f1fa8c +} + +:root .chroma .dl { + color: #f1fa8c +} + +:root .chroma .sd { + color: #f1fa8c +} + +:root .chroma .s2 { + color: #f1fa8c +} + +:root .chroma .se { + color: #f1fa8c +} + +:root .chroma .sh { + color: #f1fa8c +} + +:root .chroma .si { + color: #f1fa8c +} + +:root .chroma .sx { + color: #f1fa8c +} + +:root .chroma .sr { + color: #f1fa8c +} + +:root .chroma .s1 { + color: #f1fa8c +} + +:root .chroma .ss { + color: #f1fa8c +} + +:root .chroma .m { + color: #bd93f9 +} + +:root .chroma .mb { + color: #bd93f9 +} + +:root .chroma .mf { + color: #bd93f9 +} + +:root .chroma .mh { + color: #bd93f9 +} + +:root .chroma .mi { + color: #bd93f9 +} + +:root .chroma .il { + color: #bd93f9 +} + +:root .chroma .mo { + color: #bd93f9 +} + +:root .chroma .o { + color: #ff79c6 +} + +:root .chroma .ow { + color: #ff79c6 +} + +:root .chroma .p { + color: inherit +} + +:root .chroma .c { + color: #96a6d8 +} + +:root .chroma .ch { + color: #96a6d8 +} + +:root .chroma .cm { + color: #96a6d8 +} + +:root .chroma .c1 { + color: #96a6d8 +} + +:root .chroma .cs { + color: #96a6d8 +} + +:root .chroma .cp { + color: #ff79c6 +} + +:root .chroma .cpf { + color: #ff79c6 +} + +:root .chroma .g { + color: inherit +} + +:root .chroma .gd { + color: #d98f90 +} + +:root .chroma .ge { + text-decoration: underline +} + +:root .chroma .gr { + color: inherit +} + +:root .chroma .gh { + font-weight: bold; + color: inherit +} + +:root .chroma .gi { + font-weight: bold +} + +:root .chroma .go { + color: #8f9ea8 +} + +:root .chroma .gp { + color: inherit +} + +:root .chroma .gs { + color: inherit +} + +:root .chroma .gu { + font-weight: bold +} + +:root .chroma .gt { + color: inherit +} + +:root .chroma .gl { + text-decoration: underline +} + +:root 
.chroma .w { + color: inherit +} + + @media (prefers-color-scheme: light) { + :root { + --header-background: #1a1a1a; + --header-font-color: #ffffff; + + --body-background: #1a1a1a; + --body-font-color: #ced3d8; + + --mark-color: #ffab00; + + --button-background: #be69d8; + --button-border-color: #be69d8; + + --link-color: #be69d8; + --link-color-visited: #c27a9e; + + --code-background: #3c393d; + --code-accent-color: #be69d8; + --code-accent-color-lite: #d896ec; + --code-font-color: #eeeded; + + --code-copy-background: #343a40; + --code-copy-font-color: #6b7784; + --code-copy-border-color: #6b7784; + --code-copy-success-color: #be69d8; + + --accent-color: #2b3035; + --accent-color-lite: #2f353a; + + --control-icons: #b2bac1; + + --footer-background: #1a1a1a; + --footer-font-color: #ffffff; + --footer-link-color: #be69d8; + --footer-link-color-visited: #be69d8; + } + + + :root .dark-mode-dim .gdoc-markdown img { + filter: brightness(0.75) grayscale(0.2) + } + + :root .gdoc-markdown .gdoc-hint, + :root .gdoc-markdown .gdoc-props__tag, + :root .gdoc-markdown .admonitionblock { + filter: saturate(2.5) brightness(0.85) + } + + :root .gdoc-markdown .gdoc-hint a, + :root .gdoc-markdown .admonitionblock a { + color: var(--hint-link-color) + } + + :root .gdoc-markdown .gdoc-hint a:visited, + :root .gdoc-markdown .admonitionblock a:visited { + color: var(--hint-link-color-visited) + } + + :root .gdoc-markdown .gdoc-hint__title, + :root .gdoc-markdown .admonitionblock table td:first-child { + background-color: rgba(134, 142, 150, .15) + } + + :root .chroma { + color: var(--code-font-color) + } + + :root .chroma .lntable td:nth-child(2) code .hl { + width: auto; + margin-left: -0.5em; + padding: 0 .5em + } + + :root .highlight pre.chroma { + width: 100%; + overflow: auto; + max-height: var(--code-max-height) + } + + :root .chroma .lntable { + border: 1px solid var(--code-accent-color); + border-radius: .15rem; + border-spacing: 0; + padding: 0; + margin: 0; + width: 100%; + display: block; + max-height: var(--code-max-height); + overflow: auto + } + + :root .chroma .lntable pre.chroma { + max-height: none; + border-radius: 0; + margin: 0 + } + + :root .chroma .lntable td:first-child code { + background-color: var(--code-accent-color-lite); + border-right: 1px solid var(--code-accent-color); + padding-left: 0; + padding-right: 0; + border-radius: 0 + } + + :root .chroma .lntable td:nth-child(2) { + width: 100%; + margin-left: 2rem + } + + :root .chroma .x { + color: inherit + } + + :root .chroma .err { + color: inherit + } + + :root .chroma .lntd { + vertical-align: top; + padding: 0; + margin: 0; + border: 0 + } + + :root .chroma .hl { + display: block; + width: 100%; + background-color: #4f1605 + } + + :root .chroma .lnt { + padding: 0 .8em + } + + :root .chroma .ln { + margin-right: .4em; + padding: 0 .4em 0 .4em; + color: #b3b3b3 + } + + :root .chroma .k { + color: #ff79c6 + } + + :root .chroma .kc { + color: #ff79c6 + } + + :root .chroma .kd { + color: #8be9fd; + font-style: italic + } + + :root .chroma .kn { + color: #ff79c6 + } + + :root .chroma .kp { + color: #ff79c6 + } + + :root .chroma .kr { + color: #ff79c6 + } + + :root .chroma .kt { + color: #8be9fd + } + + :root .chroma .n { + color: inherit + } + + :root .chroma .na { + color: #50fa7b + } + + :root .chroma .nb { + color: #8be9fd; + font-style: italic + } + + :root .chroma .bp { + color: inherit + } + + :root .chroma .nc { + color: #50fa7b + } + + :root .chroma .no { + color: inherit + } + + :root .chroma .nd { + color: inherit + } + + 
:root .chroma .ni { + color: inherit + } + + :root .chroma .ne { + color: inherit + } + + :root .chroma .nf { + color: #50fa7b + } + + :root .chroma .fm { + color: inherit + } + + :root .chroma .nl { + color: #8be9fd; + font-style: italic + } + + :root .chroma .nn { + color: inherit + } + + :root .chroma .nx { + color: inherit + } + + :root .chroma .py { + color: inherit + } + + :root .chroma .nt { + color: #ff79c6 + } + + :root .chroma .nv { + color: #8be9fd; + font-style: italic + } + + :root .chroma .vc { + color: #8be9fd; + font-style: italic + } + + :root .chroma .vg { + color: #8be9fd; + font-style: italic + } + + :root .chroma .vi { + color: #8be9fd; + font-style: italic + } + + :root .chroma .vm { + color: inherit + } + + :root .chroma .l { + color: inherit + } + + :root .chroma .ld { + color: inherit + } + + :root .chroma .s { + color: #f1fa8c + } + + :root .chroma .sa { + color: #f1fa8c + } + + :root .chroma .sb { + color: #f1fa8c + } + + :root .chroma .sc { + color: #f1fa8c + } + + :root .chroma .dl { + color: #f1fa8c + } + + :root .chroma .sd { + color: #f1fa8c + } + + :root .chroma .s2 { + color: #f1fa8c + } + + :root .chroma .se { + color: #f1fa8c + } + + :root .chroma .sh { + color: #f1fa8c + } + + :root .chroma .si { + color: #f1fa8c + } + + :root .chroma .sx { + color: #f1fa8c + } + + :root .chroma .sr { + color: #f1fa8c + } + + :root .chroma .s1 { + color: #f1fa8c + } + + :root .chroma .ss { + color: #f1fa8c + } + + :root .chroma .m { + color: #bd93f9 + } + + :root .chroma .mb { + color: #bd93f9 + } + + :root .chroma .mf { + color: #bd93f9 + } + + :root .chroma .mh { + color: #bd93f9 + } + + :root .chroma .mi { + color: #bd93f9 + } + + :root .chroma .il { + color: #bd93f9 + } + + :root .chroma .mo { + color: #bd93f9 + } + + :root .chroma .o { + color: #ff79c6 + } + + :root .chroma .ow { + color: #ff79c6 + } + + :root .chroma .p { + color: inherit + } + + :root .chroma .c { + color: #96a6d8 + } + + :root .chroma .ch { + color: #96a6d8 + } + + :root .chroma .cm { + color: #96a6d8 + } + + :root .chroma .c1 { + color: #96a6d8 + } + + :root .chroma .cs { + color: #96a6d8 + } + + :root .chroma .cp { + color: #ff79c6 + } + + :root .chroma .cpf { + color: #ff79c6 + } + + :root .chroma .g { + color: inherit + } + + :root .chroma .gd { + color: #d98f90 + } + + :root .chroma .ge { + text-decoration: underline + } + + :root .chroma .gr { + color: inherit + } + + :root .chroma .gh { + font-weight: bold; + color: inherit + } + + :root .chroma .gi { + font-weight: bold + } + + :root .chroma .go { + color: #8f9ea8 + } + + :root .chroma .gp { + color: inherit + } + + :root .chroma .gs { + color: inherit + } + + :root .chroma .gu { + font-weight: bold + } + + :root .chroma .gt { + color: inherit + } + + :root .chroma .gl { + text-decoration: underline + } + + :root .chroma .w { + color: inherit + } + + } + + /* Dark mode theming */ + :root[color-theme="dark"] { + --header-background: #1a1a1a; + --header-font-color: #ffffff; + + --body-background: #1a1a1a; + --body-font-color: #ced3d8; + + --mark-color: #ffab00; + + --button-background: #be69d8; + --button-border-color: #be69d8; + + --link-color: #be69d8; + --link-color-visited: #c27a9e; + + --code-background: #282629; + --code-accent-color: #be69d8; + --code-accent-color-lite: #d896ec; + --code-font-color: #eeeded; + + --code-copy-background: #d189cd; + --code-copy-font-color: #6b7784; + --code-copy-border-color: #6b7784; + --code-copy-success-color: #be69d8; + + --accent-color: #2b3035; + --accent-color-lite: #2f353a; + + --control-icons: 
#b2bac1; + + --footer-background: #1a1a1a; + --footer-font-color: #ffffff; + --footer-link-color: #be69d8; + --footer-link-color-visited: #be69d8; + } + @media (prefers-color-scheme: dark) { + :root { + --header-background: #1a1a1a; + --header-font-color: #ffffff; + + --body-background: #1a1a1a; + --body-font-color: #ced3d8; + + --mark-color: #ffab00; + + --button-background: #be69d8; + --button-border-color: #be69d8; + + --link-color: #be69d8; + --link-color-visited: #c27a9e; + + --code-background: #3c393d; + --code-accent-color: #be69d8; + --code-accent-color-lite: #d896ec; + --code-font-color: #eeeded; + + --code-copy-background: #d189cd; + --code-copy-font-color: #6b7784; + --code-copy-border-color: #6b7784; + --code-copy-success-color: #be69d8; + + --accent-color: #2b3035; + --accent-color-lite: #2f353a; + + --control-icons: #b2bac1; + + --footer-background: #1a1a1a; + --footer-font-color: #ffffff; + --footer-link-color: #be69d8; + --footer-link-color-visited: #be69d8; + } + } \ No newline at end of file diff --git a/dataphos-docs/static/dataphos.png b/dataphos-docs/static/dataphos.png new file mode 100644 index 0000000..198bdfb Binary files /dev/null and b/dataphos-docs/static/dataphos.png differ diff --git a/dataphos-docs/static/favicon/favicon-16x16.png b/dataphos-docs/static/favicon/favicon-16x16.png new file mode 100644 index 0000000..32f71a5 Binary files /dev/null and b/dataphos-docs/static/favicon/favicon-16x16.png differ diff --git a/dataphos-docs/static/favicon/favicon-32x32.png b/dataphos-docs/static/favicon/favicon-32x32.png new file mode 100644 index 0000000..dde94fb Binary files /dev/null and b/dataphos-docs/static/favicon/favicon-32x32.png differ diff --git a/dataphos-docs/static/favicon/favicon.svg b/dataphos-docs/static/favicon/favicon.svg new file mode 100644 index 0000000..2687467 --- /dev/null +++ b/dataphos-docs/static/favicon/favicon.svg @@ -0,0 +1,46 @@ + + + + diff --git a/dataphos-docs/static/home.PNG b/dataphos-docs/static/home.PNG new file mode 100644 index 0000000..9485bb1 Binary files /dev/null and b/dataphos-docs/static/home.PNG differ diff --git a/dataphos-docs/static/instance_n.PNG b/dataphos-docs/static/instance_n.PNG new file mode 100644 index 0000000..733c115 Binary files /dev/null and b/dataphos-docs/static/instance_n.PNG differ diff --git a/dataphos-docs/static/landing.PNG b/dataphos-docs/static/landing.PNG new file mode 100644 index 0000000..1fc0ff0 Binary files /dev/null and b/dataphos-docs/static/landing.PNG differ diff --git a/dataphos-docs/static/lineage.png b/dataphos-docs/static/lineage.png new file mode 100644 index 0000000..960a004 Binary files /dev/null and b/dataphos-docs/static/lineage.png differ diff --git a/dataphos-docs/static/login.PNG b/dataphos-docs/static/login.PNG new file mode 100644 index 0000000..1c1cb13 Binary files /dev/null and b/dataphos-docs/static/login.PNG differ diff --git a/dataphos-docs/static/persistor.png b/dataphos-docs/static/persistor.png new file mode 100644 index 0000000..d937b58 Binary files /dev/null and b/dataphos-docs/static/persistor.png differ diff --git a/dataphos-docs/static/persistor_arch.png b/dataphos-docs/static/persistor_arch.png new file mode 100644 index 0000000..c1665c5 Binary files /dev/null and b/dataphos-docs/static/persistor_arch.png differ diff --git a/dataphos-docs/static/persistor_thumbnail_deployment.jpg b/dataphos-docs/static/persistor_thumbnail_deployment.jpg new file mode 100644 index 0000000..90728fd Binary files /dev/null and 
b/dataphos-docs/static/persistor_thumbnail_deployment.jpg differ diff --git a/dataphos-docs/static/persistor_thumbnail_overview.jpg b/dataphos-docs/static/persistor_thumbnail_overview.jpg new file mode 100644 index 0000000..95ddac9 Binary files /dev/null and b/dataphos-docs/static/persistor_thumbnail_overview.jpg differ diff --git a/dataphos-docs/static/publisher.png b/dataphos-docs/static/publisher.png new file mode 100644 index 0000000..ada6ffd Binary files /dev/null and b/dataphos-docs/static/publisher.png differ diff --git a/dataphos-docs/static/publisher_thumbnail_deployment.jpg b/dataphos-docs/static/publisher_thumbnail_deployment.jpg new file mode 100644 index 0000000..9dec2dd Binary files /dev/null and b/dataphos-docs/static/publisher_thumbnail_deployment.jpg differ diff --git a/dataphos-docs/static/publisher_thumbnail_overview.jpg b/dataphos-docs/static/publisher_thumbnail_overview.jpg new file mode 100644 index 0000000..37446b1 Binary files /dev/null and b/dataphos-docs/static/publisher_thumbnail_overview.jpg differ diff --git a/dataphos-docs/static/queries.PNG b/dataphos-docs/static/queries.PNG new file mode 100644 index 0000000..e40ed5f Binary files /dev/null and b/dataphos-docs/static/queries.PNG differ diff --git a/dataphos-docs/static/runs_n.PNG b/dataphos-docs/static/runs_n.PNG new file mode 100644 index 0000000..6087850 Binary files /dev/null and b/dataphos-docs/static/runs_n.PNG differ diff --git a/dataphos-docs/static/scheduler.png b/dataphos-docs/static/scheduler.png new file mode 100644 index 0000000..17ae00f Binary files /dev/null and b/dataphos-docs/static/scheduler.png differ diff --git a/dataphos-docs/static/sr.png b/dataphos-docs/static/sr.png new file mode 100644 index 0000000..b83384e Binary files /dev/null and b/dataphos-docs/static/sr.png differ diff --git a/dataphos-docs/static/sr_architecture.png b/dataphos-docs/static/sr_architecture.png new file mode 100644 index 0000000..7a20737 Binary files /dev/null and b/dataphos-docs/static/sr_architecture.png differ diff --git a/dataphos-docs/static/sr_thumbnail_deployment.jpg b/dataphos-docs/static/sr_thumbnail_deployment.jpg new file mode 100644 index 0000000..81b7504 Binary files /dev/null and b/dataphos-docs/static/sr_thumbnail_deployment.jpg differ diff --git a/dataphos-docs/static/sr_thumbnail_overview.jpg b/dataphos-docs/static/sr_thumbnail_overview.jpg new file mode 100644 index 0000000..d907de9 Binary files /dev/null and b/dataphos-docs/static/sr_thumbnail_overview.jpg differ diff --git a/dataphos-docs/static/webcli.PNG b/dataphos-docs/static/webcli.PNG new file mode 100644 index 0000000..ef12138 Binary files /dev/null and b/dataphos-docs/static/webcli.PNG differ diff --git a/dataphos-docs/static/worker.png b/dataphos-docs/static/worker.png new file mode 100644 index 0000000..87c7e9a Binary files /dev/null and b/dataphos-docs/static/worker.png differ diff --git a/dataphos-docs/themes/hugo-geekdoc b/dataphos-docs/themes/hugo-geekdoc new file mode 160000 index 0000000..0e8f753 --- /dev/null +++ b/dataphos-docs/themes/hugo-geekdoc @@ -0,0 +1 @@ +Subproject commit 0e8f75346668d358061854b7559f75fe7f92917f diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..352fb61 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,3 @@ +# Dataphos Deployment Examples + +This contains all of the pre-populated and/or example files used in the official documentation. 
\ No newline at end of file diff --git a/examples/persistor/persistor-gcp.yaml b/examples/persistor/persistor-gcp.yaml new file mode 100644 index 0000000..0858fbf --- /dev/null +++ b/examples/persistor/persistor-gcp.yaml @@ -0,0 +1,322 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "pubsub" + SENDER_TYPE: "pubsub" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_PUBSUB_PROJECTID: "" # change this + READER_PUBSUB_SUBID: "" # change this + STORAGE_TYPE: "gcs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_PUBSUB_PROJECTID: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: gcp-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "pubsub" + SENDER_TYPE: "pubsub" + DEADLETTERENABLED: "true" + + READER_PUBSUB_PROJECTID: "" # change this + READER_PUBSUB_SUBID: "" # change this + + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_PUBSUB_PROJECTID: "" # change this + + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + 
name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "WARN" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + PUBSUB_PROJECT_ID: "" # change this + INDEXER_URL: "http://indexer-api-svc:8080" + MINIMUM_LOG_LEVEL: "WARN" + SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" + STORAGE_TYPE: "gcs" # Do not change! + PUBLISHER_TYPE: "pubsub" # Do not change! 
+ PUBLISH_TIMEOUT: "15s" + PUBLISH_COUNT_THRESHOLD: "50" + PUBLISH_DELAY_THRESHOLD: "50ms" + NUM_PUBLISH_GOROUTINES: "10" + MAX_PUBLISH_OUTSTANDING_MESSAGES: "800" + MAX_PUBLISH_OUTSTANDING_BYTES: "1048576000" + PUBLISH_ENABLE_MESSAGE_ORDERING: "false" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: rsb-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json diff --git a/examples/persistor/persistor_azure.yaml b/examples/persistor/persistor_azure.yaml new file mode 100644 index 0000000..e8aa824 --- /dev/null +++ b/examples/persistor/persistor_azure.yaml @@ -0,0 +1,291 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "servicebus" + SENDER_TYPE: "servicebus" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_SERVICEBUS_CONNECTIONSTRING: "" # change this + READER_SERVICEBUS_TOPICID: "" # change this - must be equal to STORAGE_TOPICID + READER_SERVICEBUS_SUBID: "" # change this + STORAGE_TYPE: "abs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + STORAGE_STORAGEACCOUNTID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_SERVICEBUS_CONNECTIONSTRING: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + containers: + - name: azure-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: 
+ name: pes-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "servicebus" + SENDER_TYPE: "servicebus" + DEADLETTERENABLED: "true" + READER_SERVICEBUS_CONNECTIONSTRING: "" # change this + READER_SERVICEBUS_TOPICID: "" # change this + READER_SERVICEBUS_SUBID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_SERVICEBUS_CONNECTIONSTRING: "" # change this + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "2s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + INDEXER_URL: http://indexer-api-svc:8080 + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + SB_CONNECTION_STRING: "" # change this + AZURE_STORAGE_ACCOUNT_NAME: "" # change this + MINIMUM_LOG_LEVEL: "INFO" + STORAGE_TYPE: "abs" # Do not change! + PUBLISHER_TYPE: "servicebus" # Do not change! 
+ SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "2s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + envFrom: + - configMapRef: + name: rsb-config diff --git a/examples/persistor/persistor_kafka_az_blob.yaml b/examples/persistor/persistor_kafka_az_blob.yaml new file mode 100644 index 0000000..ac571b0 --- /dev/null +++ b/examples/persistor/persistor_kafka_az_blob.yaml @@ -0,0 +1,307 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this - must be equal to STORAGE_TOPICID + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + STORAGE_TYPE: "abs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + STORAGE_STORAGEACCOUNTID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: + containers: + - name: azure-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + 
SENDER_DEADLETTERTOPIC: "" # change this + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "2s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + MINIMUM_LOG_LEVEL: "INFO" + INDEXER_URL: http://indexer-api-svc:8080 + STORAGE_TYPE: "abs" # Do not change! + PUBLISHER_TYPE: "kafka" # Do not change! 
+ SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" + AZURE_CLIENT_ID: "" # change this + AZURE_TENANT_ID: "" # change this + AZURE_CLIENT_SECRET: "" # change this + AZURE_STORAGE_ACCOUNT_NAME: "" # change this + KAFKA_BROKERS: "" # change this + KAFKA_USE_TLS: "false" + KAFKA_USE_SASL: "false" + SASL_USERNAME: "default" + SASL_PASSWORD: "default" + KAFKA_SKIP_VERIFY: "false" + KAFKA_DISABLE_COMPRESSION: "false" + KAFKA_BATCH_SIZE: "50" + KAFKA_BATCH_BYTES: "52428800" + KAFKA_BATCH_TIMEOUT: "10ms" + ENABLE_KERBEROS: "false" + KRB_CONFIG_PATH: "/path/to/config/file" + KRB_REALM: "REALM.com" + KRB_SERVICE_NAME: "kerberos-service" + KRB_KEY_TAB: "/path/to/file.keytab" + KRB_USERNAME: "user" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + envFrom: + - configMapRef: + name: rsb-config + diff --git a/examples/persistor/persistor_kafka_gcs.yaml b/examples/persistor/persistor_kafka_gcs.yaml new file mode 100644 index 0000000..e63287f --- /dev/null +++ b/examples/persistor/persistor_kafka_gcs.yaml @@ -0,0 +1,329 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: dataphos +spec: + selector: + matchLabels: + role: mongo + serviceName: mongo-service + template: + metadata: + labels: + role: mongo + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: mongo + image: mongo:4.0 + command: + - mongod + - "--bind_ip" + - 0.0.0.0 + - "--smallfiles" + - "--noprealloc" + ports: + - containerPort: 27017 + volumeMounts: + - name: mongo-persistent-volume + mountPath: /data/db + volumeClaimTemplates: + - metadata: + name: mongo-persistent-volume + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: mongo-service + namespace: dataphos + labels: + name: mongo +spec: + ports: + - port: 27017 + targetPort: 27017 + clusterIP: None + selector: + role: mongo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pes-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + INDEXERENABLED: "true" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this - must be equal to STORAGE_TOPICID + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + STORAGE_TYPE: "gcs" + STORAGE_PREFIX: "msg" + STORAGE_MSGEXTENSION: "avro" + STORAGE_MASK: "year/month/day/hour" + STORAGE_CUSTOMVALUES: "" + STORAGE_DESTINATION: "" # change this + STORAGE_TOPICID: "" # change this + SENDER_TOPICID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: persistor + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: persistor + template: + metadata: + labels: + app: persistor + spec: 
+ volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: gcp-persistor + image: syntioinc/dataphos-persistor-core:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: pes-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-config + namespace: dataphos +data: + READER_TYPE: "kafka" + SENDER_TYPE: "kafka" + DEADLETTERENABLED: "true" + READER_KAFKA_TOPICID: "" # change this + READER_KAFKA_ADDRESS: "" # change this + READER_KAFKA_GROUPID: "" # change this + SENDER_DEADLETTERTOPIC: "" # change this + MONGO_CONNECTIONSTRING: "mongodb://mongo-0.mongo-service.dataphos:27017" + MONGO_DATABASE: "indexer_db" + MONGO_COLLECTION: "indexer_collection" + SENDER_KAFKA_ADDRESS: "" # change this + BATCHSETTINGS_BATCHSIZE: "5000" + BATCHSETTINGS_BATCHTIMEOUT: "30s" + BATCHSETTINGS_BATCHMEMORY: "1000000" + MINIMUM_LOG_LEVEL: "INFO" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer + template: + metadata: + labels: + app: indexer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: indexer + image: syntioinc/dataphos-persistor-indexer:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + ports: + - containerPort: 2112 + envFrom: + - configMapRef: + name: idx-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json +--- +apiVersion: v1 +kind: Service +metadata: + name: persistor-metrics-svc + namespace: dataphos + labels: + app: persistor +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: persistor +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-metrics-svc + namespace: dataphos + labels: + app: indexer +spec: + type: LoadBalancer + ports: + - port: 2112 + selector: + app: indexer +--- +apiVersion: v1 +kind: Service +metadata: + name: indexer-api-svc + namespace: dataphos + labels: + app: indexer-api +spec: + type: LoadBalancer + ports: + - port: 8080 + selector: + app: indexer-api +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: idx-api-config + namespace: dataphos +data: + CONN: "mongodb://mongo-0.mongo-service.dataphos:27017" + DATABASE: "indexer_db" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8080" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: indexer-api-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: indexer-api + template: + metadata: + labels: + app: indexer-api + spec: + containers: + - name: indexer-api + image: syntioinc/dataphos-persistor-indexer-api:1.0.0 + envFrom: + - configMapRef: + name: idx-api-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rsb-config + namespace: dataphos +data: + PUBSUB_PROJECT_ID: "" # change this + INDEXER_URL: "http://indexer-api-svc:8080" + MINIMUM_LOG_LEVEL: "INFO" + SERVER_ADDRESS: ":8081" + USE_TLS: "false" + SERVER_TIMEOUT: "10s" + RSB_META_CAPACITY: "20000" + RSB_FETCH_CAPACITY: "200" + RSB_WORKER_NUM: "3" + RSB_ENABLE_MESSAGE_ORDERING: "false" + STORAGE_TYPE: "gcs" # Do not change! + PUBLISHER_TYPE: "kafka" # Do not change! 
+ KAFKA_BROKERS: "" # change this + KAFKA_USE_TLS: "false" + KAFKA_USE_SASL: "false" + SASL_USERNAME: "default" + SASL_PASSWORD: "default" + KAFKA_SKIP_VERIFY: "false" + KAFKA_DISABLE_COMPRESSION: "false" + KAFKA_BATCH_SIZE: "50" + KAFKA_BATCH_BYTES: "52428800" + KAFKA_BATCH_TIMEOUT: "10ms" + ENABLE_KERBEROS: "false" + KRB_CONFIG_PATH: "/path/to/config/file" + KRB_REALM: "REALM.com" + KRB_SERVICE_NAME: "kerberos-service" + KRB_KEY_TAB: "/path/to/file.keytab" + KRB_USERNAME: "user" +--- +apiVersion: v1 +kind: Service +metadata: + name: resubmitter-svc + namespace: dataphos + labels: + app: resubmitter +spec: + type: LoadBalancer + ports: + - port: 8081 + selector: + app: resubmitter +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resubmitter-deployment + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: resubmitter + template: + metadata: + labels: + app: resubmitter + spec: + volumes: + - name: google-cloud-key + secret: + secretName: per-gcp-access + containers: + - name: resubmitter + image: syntioinc/dataphos-persistor-resubmitter:1.0.0 + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: rsb-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json \ No newline at end of file diff --git a/examples/publisher/ingress.yaml b/examples/publisher/ingress.yaml new file mode 100644 index 0000000..2834417 --- /dev/null +++ b/examples/publisher/ingress.yaml @@ -0,0 +1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: publisher-webui-ingress + namespace: dataphos + annotations: + kubernetes.io/ingress.class : nginx + nginx.ingress.kubernetes.io/ssl-redirect : "true" + nginx.ingress.kubernetes.io/enable-cors : "true" + nginx.ingress.kubernetes.io/cors-allow-methods : "PUT, GET, POST, DELETE, OPTIONS" + nginx.ingress.kubernetes.io/cors-allow-origin : "*" + nginx.ingress.kubernetes.io/azure-load-balancer-health-probe-request-path: /healthz +spec: + rules: + - host: # insert your WEB UI domain name, same as in the Manager config map + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: publisher-webui + port: + number: 8080 + tls: + - hosts: + - # insert your WEB UI domain name + secretName: webui-tls-secret \ No newline at end of file diff --git a/examples/publisher/postgres_deployment.yaml b/examples/publisher/postgres_deployment.yaml new file mode 100644 index 0000000..2cf887b --- /dev/null +++ b/examples/publisher/postgres_deployment.yaml @@ -0,0 +1,78 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: publisher-source +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: publisher-postgres-source-config + namespace: publisher-source +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-source-secret + namespace: publisher-source +type: Opaque +stringData: + POSTGRES_DB: invoices + POSTGRES_USER: demo_user + POSTGRES_PASSWORD: demo_password +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres-source + namespace: publisher-source +spec: + selector: + app: publisher-postgres-source-db + ports: + - port: 5432 + type: LoadBalancer +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-source-db + namespace: publisher-source +spec: + serviceName: publisher-postgres-source + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-source-db + template: + metadata: + labels: + app: 
publisher-postgres-source-db + spec: + containers: + - name: publisher-postgres-source + image: syntioinc/dataphos-publisher-source-example:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-source-config + - secretRef: + name: publisher-postgres-source-secret + volumeMounts: + - name: publisher-postgres-source-data-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-source-data-volume + namespace: publisher-source + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 500M \ No newline at end of file diff --git a/examples/publisher/publisher.yaml b/examples/publisher/publisher.yaml new file mode 100644 index 0000000..bfac1fb --- /dev/null +++ b/examples/publisher/publisher.yaml @@ -0,0 +1,523 @@ +# Postgres metadata database +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-postgres-config + namespace: dataphos +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-secret + namespace: dataphos +type: Opaque +stringData: + POSTGRES_DB: publisher # insert your database name, same as METADATA_DATABASE in configuration.yaml + POSTGRES_USER: publisher # insert your database username, same as METADATA_USERNAME in configuration.yaml + POSTGRES_PASSWORD: samplePassworD1212 # insert your database user password, same as METADATA_PASSWORD in configuration.yaml +--- + + +# Common configuration +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-metadata-config + namespace: dataphos +data: + METADATA_HOST: publisher-postgres.dataphos.svc + METADATA_PORT: "5432" + METADATA_DATABASE: publisher_metadata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-metadata-secret + namespace: dataphos +type: Opaque +stringData: + METADATA_USERNAME: publisher # insert your database username + METADATA_PASSWORD: samplePassworD1212 # insert your database user password +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: kafka-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Kafka cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Kafka user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Kafka user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: nats-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pulsar-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: 
pubsub-key + namespace: dataphos +type: Opaque +data: + "key.json": "" # insert your base64 encoded Pub/Sub service account key, leave empty if publishing to Pub/Sub + # not needed (optional) +--- + +apiVersion: v1 +kind: Secret +metadata: + name: encryption-keys + namespace: dataphos +type: Opaque +stringData: # insert your encryption keys, one or more + "keys.yaml": | + ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A" +--- +# Manager +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-manager-config + namespace: dataphos +data: + WEB_UI: # insert your webui domain name + FETCHER_URL: http://publisher-data-fetcher:8081 +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-manager-secret + namespace: dataphos +type: Opaque +stringData: + JWT_SECRET: SuperSecretPass! # insert your JWT secret key, 16 characters +--- + +# Data Fetcher +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-data-fetcher-config + namespace: dataphos +data: + MANAGER_URL: http://publisher-manager:8080 +--- + +# Scheduler +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-scheduler-config + namespace: dataphos +data: + WORKER_IMAGE: syntioinc/dataphos-publisher-worker:1.0.0 + FETCHER_URL: http://publisher-data-fetcher:8081 + SCHEMA_GENERATOR_URL: http://publisher-avro-schema-generator:8080 + SCHEMA_VALIDATION_URL: http:/ # insert the schema registry public URL or 0.0.0.0 if schema registry is not deployed + IMAGE_PULL_SECRET: regcred + KUBERNETES_NAMESPACE: dataphos + SECRET_NAME_PUBSUB: pubsub-key + SECRET_NAME_KAFKA: kafka-tls-credentials + SECRET_NAME_NATS: nats-tls-credentials + SECRET_NAME_PULSAR: pulsar-tls-credentials +--- + +# WebUI +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-webui-config + namespace: dataphos +data: + "server.properties": | + window.MANAGER_ENDPOINT = "/backend" +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres + namespace: dataphos +spec: + selector: + app: publisher-postgres-db + ports: + - port: 5432 +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-db + namespace: dataphos +spec: + serviceName: publisher-postgres + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-db + template: + metadata: + labels: + app: publisher-postgres-db + spec: + containers: + - name: publisher-postgres + image: postgres:latest + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-config + - secretRef: + name: publisher-postgres-secret + volumeMounts: + - name: publisher-postgres-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-volume + namespace: publisher + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi +--- + +# Initialize metadata database +apiVersion: batch/v1 +kind: Job +metadata: + name: publisher-initdb + namespace: dataphos +spec: + template: + spec: + containers: + - name: initdb + image: syntioinc/dataphos-publisher-initdb:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret + restartPolicy: OnFailure + backoffLimit: 15 +--- + + +# Avro Schema Generator +apiVersion: v1 +kind: Service +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + selector: + app: server + component: avro-schema-generator + ports: + - protocol: TCP + port: 8080 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: 
publisher-avro-schema-generator + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: avro-schema-generator + template: + metadata: + labels: + app: server + component: avro-schema-generator + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: avro-schema-generator + image: syntioinc/dataphos-publisher-avro-schema-generator:1.0.0 + resources: + limits: + cpu: 500m + requests: + cpu: 50m + memory: 250Mi +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-manager + namespace: dataphos +spec: + selector: + app: server + component: manager + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-manager + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: manager + template: + metadata: + labels: + app: server + component: manager + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-manager:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 45Mi + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: publisher-manager-config + - secretRef: + name: publisher-manager-secret + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + selector: + app: server + component: data-fetcher + ports: + - protocol: TCP + port: 8081 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: data-fetcher + template: + metadata: + labels: + app: server + component: data-fetcher + annotations: + syntio.net/logme: "true" + spec: + initContainers: + - name: check-manager-health + image: curlimages/curl:7.85.0 + command: ['sh', '-c', 'while [ `curl -s -o /dev/null -w "%{http_code}" http://publisher-manager:8080` -ne 200 ]; do echo waiting for manager to be ready...; sleep 10; done;'] + containers: + - name: data-fetcher + image: syntioinc/dataphos-publisher-data-fetcher:1.0.0 + resources: + limits: + cpu: 600m + requests: + cpu: 200m + memory: 160Mi + ports: + - containerPort: 8081 + envFrom: + - configMapRef: + name: publisher-data-fetcher-config +--- + + +# Kubernetes Service Account +apiVersion: v1 +kind: ServiceAccount +metadata: + name: publisher-sa + namespace: dataphos +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: publisher-sa-role + namespace: dataphos +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: publisher-sa-rb + namespace: dataphos +subjects: + - kind: ServiceAccount + name: publisher-sa +roleRef: + kind: Role + name: publisher-sa-role + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-scheduler + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: scheduler + template: + metadata: + labels: + app: server + component: scheduler + annotations: + syntio.net/logme: "true" + spec: + serviceAccountName: publisher-sa + containers: + - name: scheduler + image: syntioinc/dataphos-publisher-scheduler:1.0.0 + 
resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + envFrom: + - configMapRef: + name: publisher-scheduler-config + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-webui + namespace: dataphos +spec: + selector: + app: webui + component: webui + ports: + - port: 8080 + type: NodePort +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-webui + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: webui + component: webui + template: + metadata: + labels: + app: webui + component: webui + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-webui:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + ports: + - containerPort: 8080 + volumeMounts: + - name: publisher-webui-config-volume + mountPath: /usr/share/nginx/html/config.js + subPath: config.js + volumes: + - name: publisher-webui-config-volume + configMap: + name: publisher-webui-config + items: + - key: server.properties + path: config.js \ No newline at end of file diff --git a/examples/publisher/publisher_gcp.yaml b/examples/publisher/publisher_gcp.yaml new file mode 100644 index 0000000..bf6aa49 --- /dev/null +++ b/examples/publisher/publisher_gcp.yaml @@ -0,0 +1,552 @@ +# Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: dataphos +--- + +# Postgres metadata database +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-postgres-config + namespace: dataphos +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-secret + namespace: dataphos +type: Opaque +stringData: + POSTGRES_DB: dataphos_publisher # insert your database name, same as METADATA_DATABASE in configuration.yaml + POSTGRES_USER: publisher # insert your database username, same as METADATA_USERNAME in configuration.yaml + POSTGRES_PASSWORD: samplePassworD1212 # insert your database user password, same as METADATA_PASSWORD in configuration.yaml +--- + +# Common configuration +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-metadata-config + namespace: dataphos +data: + METADATA_HOST: publisher-postgres.dataphos.svc + METADATA_PORT: "5432" + METADATA_DATABASE: publisher_metadata +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-metadata-secret + namespace: dataphos +type: Opaque +stringData: + METADATA_USERNAME: publisher # insert your database username + METADATA_PASSWORD: samplePassworD1212 # insert your database user password +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pubsub-key + namespace: dataphos +type: Opaque +data: + "key.json": "" # insert your base64 encoded Pub/Sub service account key, leave empty if publishing to Pub/Sub + # not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: kafka-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Kafka cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Kafka user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Kafka user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: nats-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # 
insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pulsar-tls-credentials + namespace: dataphos +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- + +apiVersion: v1 +kind: Secret +metadata: + name: encryption-keys + namespace: dataphos +type: Opaque +stringData: # insert your encryption keys, one or more + "keys.yaml": | + ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A" +--- + +# Manager +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-manager-config + namespace: dataphos +data: + WEB_UI: https:// # insert your webui domain name + FETCHER_URL: http://publisher-data-fetcher:8081 +--- + +apiVersion: v1 +kind: Secret +metadata: + name: publisher-manager-secret + namespace: dataphos +type: Opaque +stringData: + JWT_SECRET: SuperSecretPass! # insert your JWT secret key, 16 characters +--- + +# Data Fetcher +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-data-fetcher-config + namespace: dataphos +data: + MANAGER_URL: http://publisher-manager:8080 +--- + +# Scheduler +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-scheduler-config + namespace: dataphos +data: + WORKER_IMAGE: syntioinc/dataphos-publisher-worker:1.0.0 + FETCHER_URL: http://publisher-data-fetcher:8081 + SCHEMA_GENERATOR_URL: http://publisher-avro-schema-generator:8080 + SCHEMA_VALIDATION_URL: http:// # insert the schema registry public URL or an empty string if schema registry is not deployed + IMAGE_PULL_SECRET: regcred + KUBERNETES_NAMESPACE: dataphos + SECRET_NAME_PUBSUB: pubsub-key + SECRET_NAME_KAFKA: kafka-tls-credentials + SECRET_NAME_NATS: nats-tls-credentials + SECRET_NAME_PULSAR: pulsar-tls-credentials +--- + +# WebUI +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-webui-config + namespace: dataphos +data: + "server.properties": | + window.MANAGER_ENDPOINT = "/backend" +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres + namespace: dataphos +spec: + selector: + app: publisher-postgres-db + ports: + - port: 5432 +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-db + namespace: dataphos +spec: + serviceName: publisher-postgres + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-db + template: + metadata: + labels: + app: publisher-postgres-db + spec: + containers: + - name: publisher-postgres + image: postgres:latest + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-config + - secretRef: + name: publisher-postgres-secret + volumeMounts: + - name: publisher-postgres-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-volume + namespace: dataphos + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi +--- + +# Initialize metadata database +apiVersion: batch/v1 +kind: Job +metadata: + name: publisher-initdb + 
namespace: dataphos +spec: + template: + spec: + containers: + - name: initdb + image: syntioinc/dataphos-publisher-initdb:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret + restartPolicy: OnFailure + backoffLimit: 15 +--- + + +# Avro Schema Generator +apiVersion: v1 +kind: Service +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + selector: + app: server + component: avro-schema-generator + ports: + - protocol: TCP + port: 8080 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-avro-schema-generator + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: avro-schema-generator + template: + metadata: + labels: + app: server + component: avro-schema-generator + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: avro-schema-generator + image: syntioinc/dataphos-publisher-avro-schema-generator:1.0.0 + resources: + limits: + cpu: 500m + requests: + cpu: 50m + memory: 250Mi +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-manager + namespace: dataphos +spec: + selector: + app: server + component: manager + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-manager + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: manager + template: + metadata: + labels: + app: server + component: manager + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-manager:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 45Mi + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: publisher-manager-config + - secretRef: + name: publisher-manager-secret + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + selector: + app: server + component: data-fetcher + ports: + - protocol: TCP + port: 8081 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-data-fetcher + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: data-fetcher + template: + metadata: + labels: + app: server + component: data-fetcher + annotations: + syntio.net/logme: "true" + spec: + initContainers: + - name: check-manager-health + image: curlimages/curl:7.85.0 + command: ['sh', '-c', 'while [ `curl -s -o /dev/null -w "%{http_code}" http://publisher-manager:8080` -ne 200 ]; do echo waiting for manager to be ready...; sleep 10; done;'] + containers: + - name: data-fetcher + image: syntioinc/dataphos-publisher-data-fetcher:1.0.0 + resources: + limits: + cpu: 600m + requests: + cpu: 200m + memory: 160Mi + ports: + - containerPort: 8081 + envFrom: + - configMapRef: + name: publisher-data-fetcher-config +--- + + +# Kubernetes Service Account +apiVersion: v1 +kind: ServiceAccount +metadata: + name: publisher-sa + namespace: dataphos +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: publisher-sa-role + namespace: dataphos +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding 
+metadata: + name: publisher-sa-rb + namespace: dataphos +subjects: + - kind: ServiceAccount + name: publisher-sa +roleRef: + kind: Role + name: publisher-sa-role + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-scheduler + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: scheduler + template: + metadata: + labels: + app: server + component: scheduler + annotations: + syntio.net/logme: "true" + spec: + serviceAccountName: publisher-sa + containers: + - name: scheduler + image: syntioinc/dataphos-publisher-scheduler:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + envFrom: + - configMapRef: + name: publisher-scheduler-config + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- + +apiVersion: v1 +kind: Service +metadata: + name: publisher-webui + namespace: dataphos +spec: + selector: + app: webui + component: webui + ports: + - port: 8080 + type: NodePort +--- + +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: publisher-webui-ingress + namespace: dataphos + annotations: + kubernetes.io/ingress.global-static-ip-name: # insert the name of your static IP address for Web UI ingress + ingress.gcp.kubernetes.io/pre-shared-cert: # insert the name of your Google managed certificate +spec: + rules: + - host: # insert your webui domain name, same as in the Manager config map + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: publisher-webui + port: + number: 8080 +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-webui + namespace: dataphos +spec: + replicas: 1 + selector: + matchLabels: + app: webui + component: webui + template: + metadata: + labels: + app: webui + component: webui + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-webui:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + ports: + - containerPort: 8080 + volumeMounts: + - name: publisher-webui-config-volume + mountPath: /usr/share/nginx/html/config.js + subPath: config.js + volumes: + - name: publisher-webui-config-volume + configMap: + name: publisher-webui-config + items: + - key: server.properties + path: config.js \ No newline at end of file diff --git a/examples/publisher/secrets.yaml b/examples/publisher/secrets.yaml new file mode 100644 index 0000000..1263be6 --- /dev/null +++ b/examples/publisher/secrets.yaml @@ -0,0 +1,16 @@ +# Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: dataphos +--- + +apiVersion: v1 +kind: Secret +metadata: + name: webui-tls-secret + namespace: dataphos +type: kubernetes.io/tls +stringData: + tls.crt: + tls.key: \ No newline at end of file diff --git a/examples/publisher/v3.conf b/examples/publisher/v3.conf new file mode 100644 index 0000000..3b16bce --- /dev/null +++ b/examples/publisher/v3.conf @@ -0,0 +1,18 @@ +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_req +prompt = no +[req_distinguished_name] +C = +ST = +L = +O = +OU = +CN = +[v3_req] +keyUsage = nonRepudiation, digitalSignature, keyEncipherment +extendedKeyUsage = serverAuth +subjectAltName = @alt_names +[alt_names] +DNS.1 = +DNS.2 = \ No newline at end of file diff --git a/examples/sr-registry/sr-registry.yaml b/examples/sr-registry/sr-registry.yaml new file mode 100644 index 0000000..0fc7e81 --- /dev/null +++ b/examples/sr-registry/sr-registry.yaml @@ -0,0 +1,220 @@ +apiVersion: v1 
+kind: Secret +metadata: + name: schema-registry-secret + namespace: dataphos +type: Opaque +stringData: + POSTGRES_PASSWORD: $postgres_password # insert password here + PGDATA: /data/pgdata + SR_HOST: schema-history-svc + SR_TABLE_PREFIX: syntio_schema. + SR_DBNAME: postgres + SR_USER: postgres + SERVER_PORT: "8080" + +--- +# Schema history service +apiVersion: "v1" +kind: "Service" +metadata: + name: "schema-history-svc" + namespace: dataphos +spec: + ports: + - protocol: "TCP" + port: 5432 + targetPort: 5432 + selector: + app: "schema-history" + type: "ClusterIP" +--- +# Schema history (PostgreSQL database that stores the schemas) +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: "schema-history" + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + serviceName: "schema-history-svc" + selector: + matchLabels: + app: "schema-history" + replicas: 1 + template: + metadata: + labels: + app: "schema-history" + spec: + containers: + - name: "schema-history" + image: postgres:latest + ports: + - containerPort: 5432 + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: POSTGRES_PASSWORD + - name: PGDATA + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: PGDATA + volumeMounts: + - mountPath: /data + name: "schema-history-disk" + # Volume Claim + volumeClaimTemplates: + - metadata: + name: "schema-history-disk" + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 25Gi +--- +# Registry service +apiVersion: "v1" +kind: "Service" +metadata: + name: "schema-registry-svc" + namespace: dataphos +spec: + ports: + - name: http + port: 8080 + targetPort: http + - name: compatibility + port: 8088 + targetPort: compatibility + - name: validity + port: 8089 + targetPort: validity + selector: + app: "schema-registry" + type: "LoadBalancer" + loadBalancerIP: "" +--- +# Registry deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: schema-registry + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: schema-registry + template: + metadata: + labels: + app: schema-registry + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + initContainers: + - name: check-schema-history-health + image: busybox + command: [ + "/bin/sh", + "-c", + "until nc -zv schema-history-svc 5432 -w1; do echo 'waiting for db'; sleep 1; done" + ] + - name: initdb + image: syntioinc/dataphos-schema-registry-initdb:1.0.0 + env: + - name: SR_PASSWORD + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: POSTGRES_PASSWORD + - name: SR_HOST + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_HOST + - name: SR_TABLE_PREFIX + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_TABLE_PREFIX + - name: SR_DBNAME + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_DBNAME + - name: SR_USER + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_USER + securityContext: + privileged: false + containers: + - name: gke-sr + image: syntioinc/dataphos-schema-registry-api:1.0.0 + env: + - name: SR_PASSWORD + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: POSTGRES_PASSWORD + - name: SR_HOST + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_HOST + - name: SR_TABLE_PREFIX + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_TABLE_PREFIX + - name: SR_DBNAME + valueFrom: + secretKeyRef: + name: 
schema-registry-secret + key: SR_DBNAME + - name: SR_USER + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SR_USER + - name: SERVER_PORT + valueFrom: + secretKeyRef: + name: schema-registry-secret + key: SERVER_PORT + - name: COMPATIBILITY_CHECKER_URL + value: "http://localhost:8088" + - name: VALIDITY_CHECKER_URL + value: "http://localhost:8089" + resources: + limits: + cpu: "400m" + memory: "500Mi" + requests: + cpu: "400m" + memory: "500Mi" + ports: + - name: http + containerPort: 8080 + - name: compatibility-checker + image: syntioinc/dataphos-schema-registry-compatibility:1.0.0 + ports: + - name: compatibility + containerPort: 8088 + - name: validity-checker + image: syntioinc/dataphos-schema-registry-validity:1.0.0 + ports: + - name: validity + containerPort: 8089 +--- diff --git a/examples/sr-validator/sr-validator-general.yaml b/examples/sr-validator/sr-validator-general.yaml new file mode 100644 index 0000000..411488d --- /dev/null +++ b/examples/sr-validator/sr-validator-general.yaml @@ -0,0 +1,106 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: +# Uncomment the type you want to use and fill the values for it + +# CONSUMER_TYPE: "kafka" +# CONSUMER_KAFKA_ADDRESS: +# CONSUMER_KAFKA_TOPIC: +# CONSUMER_KAFKA_GROUP_ID: + +# CONSUMER_TYPE: "pubsub" +# CONSUMER_PUBSUB_PROJECT_ID: +# CONSUMER_PUBSUB_SUBSCRIPTION_ID: + +# CONSUMER_TYPE: "servicebus" +# CONSUMER_SERVICEBUS_CONNECTION_STRING: +# CONSUMER_SERVICEBUS_TOPIC: +# CONSUMER_SERVICEBUS_SUBSCRIPTION: + + +# PRODUCER_TYPE: "kafka" +# PRODUCER_KAFKA_ADDRESS: + +# PRODUCER_TYPE: "pubsub" +# PRODUCER_PUBSUB_PROJECT_ID: + +# PRODUCER_TYPE: "servicebus" +# PRODUCER_SERVICEBUS_CONNECTION_STRING: + + TOPICS_VALID: + TOPICS_DEAD_LETTER: + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: + VALIDATORS_ENABLE_AVRO: + VALIDATORS_ENABLE_PROTOBUF: + VALIDATORS_ENABLE_CSV: + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: centralconsumer-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- diff --git a/examples/sr-validator/sr-validator-kafka-to-pubsub.yaml b/examples/sr-validator/sr-validator-kafka-to-pubsub.yaml new file mode 100644 index 0000000..c78a3ad --- /dev/null +++ b/examples/sr-validator/sr-validator-kafka-to-pubsub.yaml @@ -0,0 +1,88 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos 
+data: + CONSUMER_TYPE: "kafka" + CONSUMER_KAFKA_ADDRESS: $consumer_address # insert consumer bootstrap server here + CONSUMER_KAFKA_TOPIC: $consumer_topic # insert consumer topic + CONSUMER_KAFKA_GROUP_ID: $consumer_group_id # insert consumer group ID + + PRODUCER_TYPE: "pubsub" + PRODUCER_PUBSUB_PROJECT_ID: $producer_project_ID # insert GCP project ID + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: centralconsumer-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- diff --git a/examples/sr-validator/sr-validator-kafka.yaml b/examples/sr-validator/sr-validator-kafka.yaml new file mode 100644 index 0000000..e060279 --- /dev/null +++ b/examples/sr-validator/sr-validator-kafka.yaml @@ -0,0 +1,76 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: + CONSUMER_TYPE: "kafka" + CONSUMER_KAFKA_ADDRESS: $consumer_address # insert consumer bootstrap server here + CONSUMER_KAFKA_TOPIC: $consumer_topic # insert consumer topic + CONSUMER_KAFKA_GROUP_ID: $consumer_group_id # insert consumer group ID + + PRODUCER_TYPE: "kafka" + PRODUCER_KAFKA_ADDRESS: $producer_address # insert producer bootstrap server here + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + 
app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + envFrom: + - configMapRef: + name: centralconsumer-config + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred +--- diff --git a/examples/sr-validator/sr-validator-pubsub.yaml b/examples/sr-validator/sr-validator-pubsub.yaml new file mode 100644 index 0000000..dabe1e2 --- /dev/null +++ b/examples/sr-validator/sr-validator-pubsub.yaml @@ -0,0 +1,87 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: + CONSUMER_TYPE: "pubsub" + CONSUMER_PUBSUB_PROJECT_ID: $consumer_project_ID # insert consumer GCP project ID + CONSUMER_PUBSUB_SUBSCRIPTION_ID: $consumer_subscription_ID # insert consumer pubsub subscription ID + PRODUCER_TYPE: "pubsub" + PRODUCER_PUBSUB_PROJECT_ID: $producer_project_ID # insert producer GCP project ID + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- + +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + volumes: + - name: google-cloud-key + secret: + secretName: service-account-credentials + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + volumeMounts: + - mountPath: /var/secrets/google + name: google-cloud-key + envFrom: + - configMapRef: + name: centralconsumer-config + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/key.json + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- diff --git a/examples/sr-validator/sr-validator-servicebus.yaml b/examples/sr-validator/sr-validator-servicebus.yaml new file mode 100644 index 0000000..425d720 --- /dev/null +++ b/examples/sr-validator/sr-validator-servicebus.yaml @@ -0,0 +1,78 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: centralconsumer-config + namespace: dataphos +data: + CONSUMER_TYPE: "servicebus" + CONSUMER_SERVICEBUS_CONNECTION_STRING: $consumer_servicebus_connection_string # insert the consumer service bus connection string + CONSUMER_SERVICEBUS_TOPIC: $consumer_servicebus_topic # insert the consumer service bus topic + CONSUMER_SERVICEBUS_SUBSCRIPTION: $consumer_servicebus_subscription # insert the consumer service bus subscription + + PRODUCER_TYPE: "servicebus" + PRODUCER_SERVICEBUS_CONNECTION_STRING: $producer_servicebus_connection_string # insert the producer 
service bus connection string + + TOPICS_VALID: $producer_valid_topic_ID # insert producer valid topic + TOPICS_DEAD_LETTER: $producer_deadletter_topic_ID # insert producer dead-letter topic + + REGISTRY_URL: "http://schema-registry-svc:8080" + VALIDATORS_ENABLE_JSON: "true" + VALIDATORS_ENABLE_AVRO: "false" + VALIDATORS_ENABLE_PROTOBUF: "false" + VALIDATORS_ENABLE_CSV: "false" + VALIDATORS_CSV_URL: "http://csv-validator-svc:8080" + VALIDATORS_ENABLE_XML: "false" + VALIDATORS_XML_URL: "http://xml-validator-svc:8081" + +--- +apiVersion: "v1" +kind: "Service" +metadata: + name: "metrics" + namespace: dataphos +spec: + ports: + - name: metrics + port: 2112 + targetPort: 2112 + selector: + app: "centralconsumer" + type: "LoadBalancer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: centralconsumer + namespace: dataphos + annotations: + "syntio.net/logme": "true" +spec: + replicas: 1 + selector: + matchLabels: + app: centralconsumer + template: + metadata: + labels: + app: centralconsumer + spec: + containers: + - name: centralconsumer + image: syntioinc/dataphos-schema-registry-validator:1.0.0 + resources: + limits: + cpu: "125m" + memory: "80Mi" + requests: + cpu: "125m" + memory: "40Mi" + envFrom: + - configMapRef: + name: centralconsumer-config + ports: + - name: metrics + containerPort: 2112 + imagePullSecrets: + - name: nexuscred + +--- diff --git a/scripts/persistor/delete_deployments.sh b/scripts/persistor/delete_deployments.sh new file mode 100644 index 0000000..0080ce0 --- /dev/null +++ b/scripts/persistor/delete_deployments.sh @@ -0,0 +1,226 @@ +#!/bin/bash +kubectl delete secret per-gcp-access -n dataphos +kubectl delete -f - < 1 + for: 1m + labels: + severity: slack + annotations: + summary: High Memory Usage + prometheus.yml: |- + global: + scrape_interval: 5s + evaluation_interval: 5s + rule_files: + - /etc/prometheus/prometheus.rules + alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "alertmanager.$namespace.svc:9093" + scrape_configs: + - job_name: schema-registry + scrape_interval: 5s + metrics_path: "/metrics" + static_configs: + - targets: ["metrics:2112"] + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: 'node-exporter' + action: keep + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - job_name: 'kubernetes-nodes' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: 
[__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] + - job_name: 'kubernetes-cadvisor' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-deployment + namespace: $namespace + labels: + app: prometheus-server +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus-server + template: + metadata: + labels: + app: prometheus-server + spec: + containers: + - name: prometheus + image: prom/prometheus + args: + - "--storage.tsdb.retention.time=12h" + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus/" + ports: + - containerPort: 9090 + resources: + requests: + cpu: 500m + memory: 500M + limits: + cpu: 1 + memory: 1Gi + volumeMounts: + - name: prometheus-config-volume + mountPath: /etc/prometheus/ + - name: prometheus-storage-volume + mountPath: /prometheus/ + volumes: + - name: prometheus-config-volume + configMap: + defaultMode: 420 + name: prometheus-server-conf + + - name: prometheus-storage-volume + emptyDir: {} +--- +EOF \ No newline at end of file diff --git a/scripts/publisher/publisher.ps1 b/scripts/publisher/publisher.ps1 new file mode 100644 index 0000000..6e4b770 --- /dev/null +++ b/scripts/publisher/publisher.ps1 @@ -0,0 +1,519 @@ +#! 
/usr/bin/pwsh + +if($args.Count -ne 3){ + Write-Host "please specify all required variables" + exit 1 +} + +$namespace=$args[0] +$postgres_user=$args[1] +$postgres_password=$args[2] + +$myYaml = @" +# Manager and Web UI services +apiVersion: v1 +kind: Service +metadata: + name: publisher-manager + namespace: $namespace +spec: + selector: + app: server + component: manager + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + name: publisher-webui + namespace: $namespace +spec: + selector: + app: webui + component: webui + ports: + - port: 8080 + type: LoadBalancer +--- +# Postgres metadata database +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-postgres-config + namespace: $namespace +data: + PGDATA: /var/lib/postgresql/data/pgdata +--- +apiVersion: v1 +kind: Secret +metadata: + name: publisher-postgres-secret + namespace: $namespace +type: Opaque +stringData: + POSTGRES_USER: $postgres_user + POSTGRES_PASSWORD: $postgres_password +--- +apiVersion: v1 +kind: Service +metadata: + name: publisher-postgres + namespace: $namespace +spec: + selector: + app: publisher-postgres-db + ports: + - port: 5432 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: publisher-postgres-db + namespace: $namespace +spec: + serviceName: publisher-postgres + replicas: 1 + selector: + matchLabels: + app: publisher-postgres-db + template: + metadata: + labels: + app: publisher-postgres-db + spec: + containers: + - name: publisher-postgres + image: postgres:latest + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-postgres-config + - secretRef: + name: publisher-postgres-secret + volumeMounts: + - name: publisher-postgres-volume + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: publisher-postgres-volume + namespace: $namespace + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi +--- +# Common configuration +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-metadata-config + namespace: $namespace +data: + METADATA_HOST: publisher-postgres + METADATA_PORT: "5432" + METADATA_DATABASE: publisher_metadata +--- +apiVersion: v1 +kind: Secret +metadata: + name: publisher-metadata-secret + namespace: $namespace +type: Opaque +stringData: + METADATA_USERNAME: $postgres_user # insert your database username + METADATA_PASSWORD: $postgres_password # insert your database user password +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pubsub-key + namespace: $namespace +type: Opaque +data: + "key.json": "" # insert your base64 encoded Pub/Sub service account key, leave empty if publishing to Pub/Sub + # not needed (optional) +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: kafka-tls-credentials + namespace: $namespace +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Kafka cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Kafka user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Kafka user TLS private key, leave empty if not needed (optional) +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: nats-tls-credentials + namespace: $namespace +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Nats cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Nats user 
TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Nats user TLS private key, leave empty if not needed (optional) +--- +# optional secret +apiVersion: v1 +kind: Secret +metadata: + name: pulsar-tls-credentials + namespace: $namespace +type: Opaque +data: + "ca_crt.pem": "" # insert your base64 encoded Pulsar cluster CA TLS certificate, leave empty if not needed (optional) + "client_crt.pem": "" # insert your base64 encoded Pulsar user TLS certificate, leave empty if not needed (optional) + "client_key.pem": "" # insert your base64 encoded Pulsar user TLS private key, leave empty if not needed (optional) +--- +apiVersion: v1 +kind: Secret +metadata: + name: encryption-keys + namespace: $namespace +type: Opaque +stringData: # insert your encryption keys, one or more + "keys.yaml": | + ENC_KEY_1: "D2C0B5865AE141A49816F1FDC110FA5A" +--- +# Initialize metadata database +apiVersion: batch/v1 +kind: Job +metadata: + name: publisher-initdb + namespace: $namespace +spec: + template: + spec: + containers: + - name: initdb + image: syntioinc/dataphos-publisher-initdb:1.0.0 + ports: + - containerPort: 5432 + envFrom: + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret + restartPolicy: OnFailure + backoffLimit: 15 +--- +# Avro Schema Generator +apiVersion: v1 +kind: Service +metadata: + name: publisher-avro-schema-generator + namespace: $namespace +spec: + selector: + app: server + component: avro-schema-generator + ports: + - protocol: TCP + port: 8080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-avro-schema-generator + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: avro-schema-generator + template: + metadata: + labels: + app: server + component: avro-schema-generator + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: avro-schema-generator + image: syntioinc/dataphos-publisher-avro-schema-generator:1.0.0 + resources: + limits: + cpu: 500m + requests: + cpu: 50m + memory: 250Mi +--- +# Kubernetes Service Account +apiVersion: v1 +kind: ServiceAccount +metadata: + name: publisher-sa + namespace: $namespace +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: publisher-sa-role + namespace: $namespace +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: publisher-sa-rb + namespace: $namespace +subjects: + - kind: ServiceAccount + name: publisher-sa +roleRef: + kind: Role + name: publisher-sa-role + apiGroup: rbac.authorization.k8s.io +--- +# Scheduler +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-scheduler-config + namespace: $namespace +data: + WORKER_IMAGE: syntioinc/dataphos-publisher-worker:1.0.0 + FETCHER_URL: http://publisher-data-fetcher:8081 + SCHEMA_GENERATOR_URL: http://publisher-avro-schema-generator:8080 + SCHEMA_VALIDATION_URL: "" # insert the schema registry public URL or an empty string if schema registry is not deployed + IMAGE_PULL_SECRET: regcred + KUBERNETES_NAMESPACE: $namespace + SECRET_NAME_PUBSUB: pubsub-key + SECRET_NAME_KAFKA: kafka-tls-credentials + SECRET_NAME_NATS: nats-tls-credentials + SECRET_NAME_PULSAR: pulsar-tls-credentials +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-scheduler + namespace: $namespace +spec: + replicas: 1 
+ selector: + matchLabels: + app: server + component: scheduler + template: + metadata: + labels: + app: server + component: scheduler + annotations: + syntio.net/logme: "true" + spec: + serviceAccountName: publisher-sa + containers: + - name: scheduler + image: syntioinc/dataphos-publisher-scheduler:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + envFrom: + - configMapRef: + name: publisher-scheduler-config + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- +"@ + +$myYaml |kubectl.exe apply -f - +do{ + $webui_ip = $(kubectl get services --namespace $namespace publisher-webui --output jsonpath='{.status.loadBalancer.ingress[0].ip}') + Write-Host "Waiting for Web UI service to be created..." + Start-Sleep -Seconds 10 +}while($null -eq $webui_ip) +Write-Host "$webui_ip" + +$oneYaml = @" +# Manager +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-manager-config + namespace: $namespace +data: + WEB_UI: http://$($webui_ip):8080 + FETCHER_URL: http://publisher-data-fetcher:8081 +--- +apiVersion: v1 +kind: Secret +metadata: + name: publisher-manager-secret + namespace: $namespace +type: Opaque +stringData: + JWT_SECRET: "DuperSecretPass!" # insert your JWT secret key, 16 characters +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-manager + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: manager + template: + metadata: + labels: + app: server + component: manager + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-manager:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 45Mi + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: publisher-manager-config + - secretRef: + name: publisher-manager-secret + - configMapRef: + name: publisher-metadata-config + - secretRef: + name: publisher-metadata-secret +--- +# WebUI +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-webui-config + namespace: $namespace +data: # insert your manager domain name + "server.properties": | + window.MANAGER_ENDPOINT = "/backend" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-webui + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: webui + component: webui + template: + metadata: + labels: + app: webui + component: webui + spec: + containers: + - name: manager + image: syntioinc/dataphos-publisher-webui:1.0.0 + resources: + limits: + cpu: 100m + requests: + cpu: 5m + memory: 30Mi + ports: + - containerPort: 8080 + volumeMounts: + - name: publisher-webui-config-volume + mountPath: /usr/share/nginx/html/config.js + subPath: config.js + volumes: + - name: publisher-webui-config-volume + configMap: + name: publisher-webui-config + items: + - key: server.properties + path: config.js +"@ + +$oneYaml |kubectl.exe apply -f - + +$newYaml = @" +# Data Fetcher +kind: ConfigMap +apiVersion: v1 +metadata: + name: publisher-data-fetcher-config + namespace: $namespace +data: + MANAGER_URL: http://publisher-manager:8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: publisher-data-fetcher + namespace: $namespace +spec: + selector: + app: server + component: data-fetcher + ports: + - protocol: TCP + port: 8081 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: publisher-data-fetcher + namespace: $namespace +spec: + replicas: 1 + selector: + matchLabels: + app: server + component: 
data-fetcher + template: + metadata: + labels: + app: server + component: data-fetcher + annotations: + syntio.net/logme: "true" + spec: + containers: + - name: data-fetcher + image: syntioinc/dataphos-publisher-data-fetcher:1.0.0 + resources: + limits: + cpu: 600m + requests: + cpu: 200m + memory: 160Mi + ports: + - containerPort: 8081 + envFrom: + - configMapRef: + name: publisher-data-fetcher-config +--- +"@ +$newYaml |kubectl.exe apply -f - \ No newline at end of file diff --git a/scripts/publisher/publisher.sh b/scripts/publisher/publisher.sh new file mode 100644 index 0000000..e042bb2 --- /dev/null +++ b/scripts/publisher/publisher.sh @@ -0,0 +1,534 @@ +#!/bin/bash + +Help() +{ +echo "Flags:" +echo "-n - the namespace where Publisher will be deployed" +echo "-u - username for the Postgres metadata database" +echo "-p - password for the Postgres metadata database" +} + +if [ $# -eq 1 ] && [ "$1" == "--help" ]; then + Help + exit +fi + +if [ $# -ne 6 ]; then + echo "Please specify all required variables" + exit 1 +fi + +while getopts n:u:p: flag +do + case "${flag}" in + n) namespace=${OPTARG};; + u) postgres_user=${OPTARG};; + p) postgres_password=${OPTARG};; + esac +done + +kubectl apply -f - <