DM Chaos #8112

Workflow file for this run

.github/workflows/dm_chaos.yaml at 9d91f5a

	# Workflow name and triggers: a cron schedule plus manual dispatch against a PR.
	name: DM Chaos

	on:
	  schedule:
	    - cron: '0 17-23 * * *'  # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
	  workflow_dispatch:
	    inputs:
	      # PR number; a later step checks out refs/pull/<pr>/head when it is non-empty.
	      pr:
	        description: 'Which PR do you want to trigger'
	        required: true
	        default: ''

	# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
	# Runs are grouped per git ref and workflow; a newer run in the same group
	# cancels the one still in progress.
	concurrency:
	  group: ${{ github.ref }}-${{ github.workflow }}
	  cancel-in-progress: true

	# A workflow run is made up of one or more jobs that can run sequentially or in parallel
	jobs:
	  # This workflow contains a single job called "base". It is instantiated once
	  # per chaos-mesh manifest listed in the matrix; `fail-fast: false` lets the
	  # remaining matrix entries keep running when one of them fails.
	  base:
	    # The type of runner that the job will run on
	    # NOTE(review): GitHub has retired the hosted ubuntu-20.04 image. It is left
	    # unchanged here because moving to a newer image requires re-verifying the
	    # k3s (--docker) and AppArmor steps below.
	    runs-on: ubuntu-20.04
	    timeout-minutes: 50
	    strategy:
	      fail-fast: false
	      matrix:
	        # Each entry is the base name of a manifest under dm/chaos/manifests/.
	        chaos-obj:
	          - pod-failure-dm
	          - pod-kill-dm
	          - network-partition-dm
	          - network-emulation-dm
	          - io-chaos-dm

	    # Steps represent a sequence of tasks that will be executed as part of the job
	    steps:
	      # Set up Go for building DM
	      - name: Set up Go env
	        uses: actions/setup-go@v3
	        with:
	          go-version: '1.23'
	      - name: Print Go version
	        run: go version

	      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
	      - name: Check out code
	        uses: actions/checkout@v2

	      # On manual dispatch, replace the default checkout with the requested PR head.
	      - name: Check out code by workflow dispatch
	        if: ${{ github.event.inputs.pr != '' }}
	        uses: actions/checkout@v2
	        with:
	          ref: refs/pull/${{ github.event.inputs.pr }}/head

	      # actions/cache v1-v2 have been retired by GitHub, hence v4 here.
	      - name: Cache go modules
	        uses: actions/cache@v4
	        with:
	          path: ~/go/pkg/mod
	          key: ${{ runner.os }}-ticdc-${{ hashFiles('go.sum') }}

	      - name: Cache Tools
	        id: cache-tools
	        uses: actions/cache@v4
	        with:
	          path: tools/bin
	          key: ${{ runner.os }}-ticdc-tools-${{ hashFiles('tools/check/go.sum') }}

	      # `${k3s_disable_command:---disable}` expands to `--disable` unless the
	      # variable k3s_disable_command is set to something else.
	      - name: install k3s
	        run: |
	          curl -fsSL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 \
	            "${k3s_disable_command:---disable}" metrics-server \
	            "${k3s_disable_command:---disable}" traefik \
	            --flannel-backend=none \
	            --docker
	        shell: bash

	      # Written to $GITHUB_ENV so that every later step sees KUBECONFIG.
	      - name: Export KUBECONFIG environment variable
	        run: |
	          echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> $GITHUB_ENV
	        shell: bash

	      - name: Print cluster information
	        run: |
	          kubectl config view
	          kubectl cluster-info
	          kubectl get nodes
	          kubectl get pods -n kube-system
	          kubectl get sc
	          kubectl version
	          helm version

	      # Disable AppArmor for MySQL, see https://github.com/moby/moby/issues/7512#issuecomment-61787845
	      - name: Disable AppArmor for MySQL
	        run: |
	          sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/
	          sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld

	      - name: Build DM binary
	        run: make dm-master dm-worker dmctl dm-chaos-case

	      # NOTE: we also copy config files into `bin` directory,
	      # so we only need to send `bin` as the context into docker daemon when building image.
	      - name: Build DM docker image
	        run: |
	          cp -r $GITHUB_WORKSPACE/dm/chaos/cases/conf/ $GITHUB_WORKSPACE/bin/
	          docker build -f $GITHUB_WORKSPACE/dm/chaos/manifests/Dockerfile -t dm:chaos $GITHUB_WORKSPACE/bin
	          docker image list

	      # Set up upstream instances
	      - name: Set up sources
	        run: |
	          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
	          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
	          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
	      # The first round of waits is tolerant (`|| true`) so that the diagnostics
	      # below are always printed; the final `--timeout=0s` waits fail the step
	      # if a pod is still not Ready.
	      - name: Wait for sources ready  # kubectl wait --all not working
	        run: |
	          kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true
	          kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true
	          kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true
	          sleep 10
	          echo show pvc
	          kubectl get pvc -l app=sources -o wide
	          echo show pv
	          kubectl get pv -o wide
	          echo show svc
	          kubectl get svc -l app=sources -o wide
	          echo show sts
	          kubectl get sts -l app=sources -o wide
	          echo show po
	          kubectl get po -l app=sources -o wide
	          echo describe po
	          kubectl describe po -l app=sources
	          echo describe pvc
	          kubectl describe pvc -l app=sources
	          kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s
	          kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s
	          kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s

	      # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
	      - name: Set up TiDB
	        run: |
	          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
	          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
	          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
	      # Same pattern as for the sources: tolerant wait, diagnostics, strict re-check.
	      - name: Wait for TiDB ready
	        run: |
	          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=300s || true
	          echo show pvc
	          kubectl get pvc -l app=tidb -o wide
	          echo show pv
	          kubectl get pv -o wide
	          echo show svc
	          kubectl get svc -l app=tidb -o wide
	          echo show sts
	          kubectl get sts -l app=tidb -o wide
	          echo show po
	          kubectl get po -l app=tidb -o wide
	          echo describe po
	          kubectl describe po -l app=tidb
	          echo describe pvc
	          kubectl describe pvc -l app=tidb
	          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s

	      - name: Set up DM-master
	        run: |
	          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/dm-master.yaml
	          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/dm-master.yaml
	          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/dm-master.yaml
	      # NOTE: even if some DM-master instances are not ready, we still continue and let the chaos test cases check again.
	      - name: Wait for DM-master ready
	        run: |
	          sleep 10
	          kubectl wait --for=condition=Ready pod -l app=dm-master --all --timeout=300s || true
	          echo "<<<<< show pvc >>>>>"
	          kubectl get pvc -l app=dm-master -o wide
	          echo "<<<<< show pv >>>>>"
	          kubectl get pv -o wide
	          echo "<<<<< show svc >>>>>"
	          kubectl get svc -l app=dm-master -o wide
	          echo "<<<<< show sts >>>>>"
	          kubectl get sts -l app=dm-master -o wide
	          echo "<<<<< show po >>>>>"
	          kubectl get po -l app=dm-master -o wide
	          echo "<<<<< describe po >>>>>"
	          kubectl describe po -l app=dm-master
	          echo "<<<<< describe pvc >>>>>"
	          kubectl describe pvc -l app=dm-master
	          echo "<<<<< show current log for dm-master-0 >>>>>"
	          kubectl logs dm-master-0 || true
	          echo "<<<<< show previous log for dm-master-0 >>>>>"
	          kubectl logs dm-master-0 -p || true
	          echo "<<<<< show current log for dm-master-1 >>>>>"
	          kubectl logs dm-master-1 || true
	          echo "<<<<< show previous log for dm-master-1 >>>>>"
	          kubectl logs dm-master-1 -p || true
	          echo "<<<<< show current log for dm-master-2 >>>>>"
	          kubectl logs dm-master-2 || true
	          echo "<<<<< show previous log for dm-master-2 >>>>>"
	          kubectl logs dm-master-2 -p || true

	      - name: Set up DM-worker
	        run: |
	          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/dm-worker.yaml
	          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/dm-worker.yaml
	          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/dm-worker.yaml
	      # NOTE: even if some DM-worker instances are not ready, we still continue and let the chaos test cases check again.
	      - name: Wait for DM-worker ready
	        run: |
	          sleep 10
	          kubectl wait --for=condition=Ready pod -l app=dm-worker --all --timeout=300s || true
	          echo "<<<<< show pvc >>>>>"
	          kubectl get pvc -l app=dm-worker -o wide
	          echo "<<<<< show pv >>>>>"
	          kubectl get pv -o wide
	          echo "<<<<< show svc >>>>>"
	          kubectl get svc -l app=dm-worker -o wide
	          echo "<<<<< show sts >>>>>"
	          kubectl get sts -l app=dm-worker -o wide
	          echo "<<<<< show po >>>>>"
	          kubectl get po -l app=dm-worker -o wide
	          echo "<<<<< describe po >>>>>"
	          kubectl describe po -l app=dm-worker
	          echo "<<<<< describe pvc >>>>>"
	          kubectl describe pvc -l app=dm-worker
	          echo "<<<<< show current log for dm-worker-0 >>>>>"
	          kubectl logs dm-worker-0 || true
	          echo "<<<<< show previous log for dm-worker-0 >>>>>"
	          kubectl logs dm-worker-0 -p || true
	          echo "<<<<< show current log for dm-worker-1 >>>>>"
	          kubectl logs dm-worker-1 || true
	          echo "<<<<< show previous log for dm-worker-1 >>>>>"
	          kubectl logs dm-worker-1 -p || true
	          echo "<<<<< show current log for dm-worker-2 >>>>>"
	          kubectl logs dm-worker-2 || true
	          echo "<<<<< show previous log for dm-worker-2 >>>>>"
	          kubectl logs dm-worker-2 -p || true

	      # NOTE: we sleep a while when check members ready in cases before applying any chaos operations.
	      - name: Set up chaos test cases
	        run: |
	          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/cases.yaml
	          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/cases.yaml
	          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/cases.yaml
	          sleep 60

	      # Base64-encode (single line, `-w 0`) the manifest selected by the matrix
	      # and export it as CFG_BASE64 for the next step.
	      - name: Encode chaos-mesh action
	        run: |
	          echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/dm/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

	      - name: Run chaos mesh action
	        uses: chaos-mesh/chaos-mesh-action@master
	        env:
	          CFG_BASE64: ${{ env.CFG_BASE64 }}

	      # check whether complete with 1m * 20 times.
	      - name: Wait for chaos test case complete
	        run: |
	          $GITHUB_WORKSPACE/dm/chaos/scripts/check-case.sh

	      # Only runs when an earlier step failed.
	      - name: Setup tmate session
	        if: ${{ failure() }}
	        uses: mxschmitt/action-tmate@v3

	      # Copies /log/<pod>.log out of every pod whose name matches "dm-".
	      # `mkdir -p` keeps the step from failing when ./logs already exists;
	      # the copy pipeline is best-effort (`|| true`).
	      - name: Copy logs to hack permission
	        if: ${{ always() }}
	        run: |
	          mkdir -p ./logs
	          kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep -E "dm-" | xargs -I{} kubectl cp {}:/log/{}.log ./logs/{}.log || true
	          sudo chown -R runner ./logs
	      # Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
	      - name: Upload logs
	        continue-on-error: true
	        uses: actions/upload-artifact@v4
	        if: ${{ always() }}
	        with:
	          name: chaos-base-logs.${{ matrix.chaos-obj }}
	          path: |
	            ./logs

	      # send Slack notify if failed.
	      # NOTE: With the exception of `GITHUB_TOKEN`, secrets are not passed to the runner when a workflow is triggered from a forked repository.
	      # NOTE(review): the `uses:` reference was mangled by e-mail obfuscation in
	      # the captured page; restored as action-slack@2.1.0 - confirm the tag.
	      - name: Slack notification
	        if: ${{ failure() }}
	        env:
	          SLACK_WEBHOOK: ${{ secrets.SLACK_NOTIFY }}
	        uses: Ilshidur/action-slack@2.1.0
	        with:
	          args: "chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/{{ GITHUB_RUN_ID }}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

DM Chaos #8112

Workflow file

DM Chaos #8112

Jobs

Run details

Workflow file for this run