diff --git a/.env.local b/.env.local
index bb076a46a..2d2603a94 100644
--- a/.env.local
+++ b/.env.local
@@ -89,7 +89,6 @@ export ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho

 # Samba
 export ONETL_SAMBA_HOST=localhost
-export ONETL_SAMBA_PROTOCOL=SMB
 export ONETL_SAMBA_UID=1000
 export ONETL_SAMBA_GID=1000
 export ONETL_SAMBA_PORT=445
diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml
index 9c8c558ba..587750990 100644
--- a/.github/workflows/data/clickhouse/matrix.yml
+++ b/.github/workflows/data/clickhouse/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
@@ -22,7 +22,7 @@ latest: &latest
 matrix:
   small:
     - clickhouse-image: clickhouse/clickhouse-server
-      clickhouse-version: 23.6.1-alpine
+      clickhouse-version: 24.3.2.23-alpine
       <<: *max
   full:
     # the lowest supported Clickhouse version by JDBC driver
@@ -30,7 +30,7 @@ matrix:
       clickhouse-version: '20.7'
       <<: *min
     - clickhouse-image: clickhouse/clickhouse-server
-      clickhouse-version: 23.6.1-alpine
+      clickhouse-version: 24.3.2.23-alpine
       <<: *max
   nightly:
     - clickhouse-image: yandex/clickhouse-server
diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml
index a7339e139..d20f074ab 100644
--- a/.github/workflows/data/core/matrix.yml
+++ b/.github/workflows/data/core/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/core/tracked.txt b/.github/workflows/data/core/tracked.txt
index da678a6a1..5b2a3ca4d 100644
--- a/.github/workflows/data/core/tracked.txt
+++ b/.github/workflows/data/core/tracked.txt
@@ -2,5 +2,7 @@ onetl/hooks/**
 onetl/plugins/**
 onetl/impl/**
 onetl/hwm/**
+onetl/_util/**
 onetl/_internal.py
 onetl/log.py
+.github/workflows/data/core/**
diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml
index 49468d914..d01c39029 100644
--- a/.github/workflows/data/ftp/matrix.yml
+++ b/.github/workflows/data/ftp/matrix.yml
@@ -15,7 +15,7 @@ latest: &latest

 matrix:
   small:
-  # chonjay21/ftps image has only latest tag
+    # chonjay21/ftps image has only latest tag
     - ftp-version: latest
       <<: *max
   full:
diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml
index ec5a862cd..efe28e79a 100644
--- a/.github/workflows/data/ftps/matrix.yml
+++ b/.github/workflows/data/ftps/matrix.yml
@@ -15,7 +15,7 @@ latest: &latest

 matrix:
   small:
-  # chonjay21/ftps image has only latest tag
+    # chonjay21/ftps image has only latest tag
     - ftps-version: latest
       <<: *max
   full:
diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml
index 2b66c0e19..28ec20e75 100644
--- a/.github/workflows/data/greenplum/matrix.yml
+++ b/.github/workflows/data/greenplum/matrix.yml
@@ -28,14 +28,14 @@ matrix:
       package-version: 2.3.1
       <<: *max
   full:
-    - greenplum-version: 6.25.3
+    - greenplum-version: 6.23.1
       package-version: 2.2.0
       <<: *min
     - greenplum-version: 7.0.0
       package-version: 2.3.1
       <<: *max
   nightly:
-    - greenplum-version: 6.25.3
+    - greenplum-version: 6.23.1
       package-version: 2.2.0
       <<: *min
     - greenplum-version: 7.0.0
diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml
index e62f0242a..6d8156c50 100644
--- a/.github/workflows/data/hdfs/matrix.yml
+++ b/.github/workflows/data/hdfs/matrix.yml
@@ -8,7 +8,7 @@ min: &min

 max: &max
   hadoop-version: hadoop3-hdfs
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml
index 0f7d4ba6b..6ce0d7a8e 100644
--- a/.github/workflows/data/hive/matrix.yml
+++ b/.github/workflows/data/hive/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml
index 29e587721..ab0932921 100644
--- a/.github/workflows/data/kafka/matrix.yml
+++ b/.github/workflows/data/kafka/matrix.yml
@@ -8,9 +8,9 @@ min: &min
   os: ubuntu-latest

 max: &max
-  kafka-version: 3.5.1
+  kafka-version: 3.7.0
   pydantic-version: 2
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   python-version: '3.12'
   java-version: 20
   os: ubuntu-latest
diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml
index b3db2391f..d1337291e 100644
--- a/.github/workflows/data/local-fs/matrix.yml
+++ b/.github/workflows/data/local-fs/matrix.yml
@@ -20,6 +20,7 @@ min_excel: &min_excel
   os: ubuntu-latest

 max: &max
+  # Excel package currently has no release for 3.5.1
   spark-version: 3.5.0
   pydantic-version: 2
   python-version: '3.12'
diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml
index c916cc306..68c19956d 100644
--- a/.github/workflows/data/mongodb/matrix.yml
+++ b/.github/workflows/data/mongodb/matrix.yml
@@ -7,7 +7,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.4.2
+  spark-version: 3.4.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
@@ -22,12 +22,12 @@ latest: &latest
 matrix:
   small:
-    - mongodb-version: 6.0.7
+    - mongodb-version: 7.0.8
       <<: *max
   full:
     - mongodb-version: 4.0.0
       <<: *min
-    - mongodb-version: 6.0.7
+    - mongodb-version: 7.0.8
       <<: *max
   nightly:
     - mongodb-version: 4.0.0
diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml
index 0138805bb..19ba2f3e3 100644
--- a/.github/workflows/data/mssql/matrix.yml
+++ b/.github/workflows/data/mssql/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml
index 9b64e3b93..cd96a63b9 100644
--- a/.github/workflows/data/mysql/matrix.yml
+++ b/.github/workflows/data/mysql/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
@@ -21,17 +21,17 @@ latest: &latest
 matrix:
   small:
-    - mysql-version: 8.0.33
+    - mysql-version: 8.3.0
       <<: *max
   full:
-  # Min supported version by JDBC driver is 5.7
-    - mysql-version: 5.7.42
+    # Min supported version by JDBC driver is 5.7
+    - mysql-version: 5.7.6
       <<: *min
-  # Max supported version by JDBC driver is 8.0
-    - mysql-version: 8.0.33
+    # Max supported version by JDBC driver is 8.3
+    - mysql-version: 8.3.0
       <<: *max
   nightly:
-    - mysql-version: 5.7.42
+    - mysql-version: 5.7.6
       <<: *min
     - mysql-version: latest
       <<: *latest
diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml
index 55dc4c185..db67b6b71 100644
--- a/.github/workflows/data/oracle/matrix.yml
+++ b/.github/workflows/data/oracle/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml
index 8cdff4f63..7b8e296e5 100644
--- a/.github/workflows/data/postgres/matrix.yml
+++ b/.github/workflows/data/postgres/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
@@ -21,13 +21,13 @@ latest: &latest
 matrix:
   small:
-    - postgres-version: 15.2-alpine
+    - postgres-version: 16.2-alpine
       <<: *max
   full:
-  # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life
+    # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life
     - postgres-version: 9.4.26-alpine
       <<: *min
-    - postgres-version: 15.2-alpine
+    - postgres-version: 16.2-alpine
       <<: *max
   nightly:
     - postgres-version: 9.4.26-alpine
diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml
index a0825603b..d9b9338f8 100644
--- a/.github/workflows/data/s3/matrix.yml
+++ b/.github/workflows/data/s3/matrix.yml
@@ -9,8 +9,8 @@ min: &min
   os: ubuntu-latest

 max: &max
-  minio-version: 2023.7.18
-  spark-version: 3.5.0
+  minio-version: 2024.4.18
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml
index 5b0b2628e..b1e6b56da 100644
--- a/.github/workflows/data/samba/matrix.yml
+++ b/.github/workflows/data/samba/matrix.yml
@@ -15,6 +15,7 @@ latest: &latest

 matrix:
   small:
+    # elswork/samba image versions do not correlate with the smbd version, which is always 4.x
     - server-version: latest
       <<: *max
   full:
diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml
index 0dfd9e730..a32f6f823 100644
--- a/.github/workflows/data/sftp/matrix.yml
+++ b/.github/workflows/data/sftp/matrix.yml
@@ -15,13 +15,13 @@ latest: &latest

 matrix:
   small:
-    - openssh-version: 9.3_p1-r3-ls120
+    - openssh-version: 9.6_p1-r0-ls154
       <<: *max
   full:
-  # prior image versions does not accept incoming connections, seems like a bug
+    # prior image versions do not accept incoming connections, seems like a bug
     - openssh-version: 8.1_p1-r0-ls5
       <<: *min
-    - openssh-version: 9.3_p1-r3-ls120
+    - openssh-version: 9.6_p1-r0-ls154
       <<: *max
   nightly:
     - openssh-version: 8.1_p1-r0-ls5
diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml
index 9647daec6..6c2a55455 100644
--- a/.github/workflows/data/teradata/matrix.yml
+++ b/.github/workflows/data/teradata/matrix.yml
@@ -1,5 +1,5 @@
 max: &max
-  spark-version: 3.5.0
+  spark-version: 3.5.1
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml
index 8d8f012a7..fb76e3282 100644
--- a/.github/workflows/data/webdav/matrix.yml
+++ b/.github/workflows/data/webdav/matrix.yml
@@ -15,7 +15,7 @@ latest: &latest

 matrix:
   small:
-  # chonjay21/webdav image has only latest tag
+    # chonjay21/webdav image has only latest tag
     - webdav-version: latest
       <<: *max
   full:
diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml
index 6c790cbc5..4f8d436ec 100644
--- a/.github/workflows/test-clickhouse.yml
+++ b/.github/workflows/test-clickhouse.yml
@@ -83,10 +83,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for Clickhouse to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 8123 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml
index 4e947d738..e41e1f3eb 100644
--- a/.github/workflows/test-ftp.yml
+++ b/.github/workflows/test-ftp.yml
@@ -50,20 +50,15 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-    # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3
-    # Cannot use services because we need to mount config file from the repo, but services start before checkout.
-    # See https://github.com/orgs/community/discussions/25792
+      # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3
+      # Cannot use services because we need to mount config file from the repo, but services start before checkout.
+      # See https://github.com/orgs/community/discussions/25792
       - name: Start FTP
         run: |
           docker compose down -v --remove-orphans
-          docker compose up -d ftp
+          docker compose up -d ftp --wait --wait-timeout 200
         env:
           FTP_IMAGE: chonjay21/ftps:${{ inputs.ftp-version }}
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }}
-
-      - name: Wait for FTP to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 2121 -t 60

       - name: Run tests
         run: |
@@ -76,8 +71,6 @@
         if: always()
         run: |
           docker compose down -v --remove-orphans
-        env:
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }}

       - name: Upload coverage results
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml
index 19cce458c..4fb9c6234 100644
--- a/.github/workflows/test-ftps.yml
+++ b/.github/workflows/test-ftps.yml
@@ -50,20 +50,15 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-    # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3
-    # Cannot use services because we need to mount config file from the repo, but services start before checkout.
-    # See https://github.com/orgs/community/discussions/25792
+      # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3
+      # Cannot use services because we need to mount config file from the repo, but services start before checkout.
+      # See https://github.com/orgs/community/discussions/25792
       - name: Start FTPS
         run: |
           docker compose down -v --remove-orphans
-          docker compose up -d ftps
+          docker compose up -d ftps --wait --wait-timeout 200
         env:
           FTPS_IMAGE: chonjay21/ftps:${{ inputs.ftps-version }}
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }}
-
-      - name: Wait for FTPS to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 2122 -t 60

       - name: Run tests
         run: |
@@ -76,8 +71,6 @@
         if: always()
         run: |
           docker compose down -v --remove-orphans
-        env:
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }}

       - name: Upload coverage results
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml
index 918e4f091..6e52a5df1 100644
--- a/.github/workflows/test-hdfs.yml
+++ b/.github/workflows/test-hdfs.yml
@@ -70,8 +70,8 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-    # Cannot use services because we need to mount config file from the repo, but services start before checkout.
-    # See https://github.com/orgs/community/discussions/25792
+      # Cannot use services because we need to mount config file from the repo, but services start before checkout.
+      # See https://github.com/orgs/community/discussions/25792
       - name: Start HDFS
         run: |
           docker compose down -v --remove-orphans
@@ -81,11 +81,6 @@
           wait $wait_pid
         env:
           HDFS_IMAGE: mtsrus/hadoop:${{ inputs.hadoop-version }}
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }}
-
-      - name: Wait for HDFS to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 9870 -t 60

       - name: Run tests
         run: |
@@ -99,8 +94,6 @@
         if: always()
         run: |
           docker compose down -v --remove-orphans
-        env:
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }}

       - name: Upload coverage results
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml
index 120ac3a40..5dd5b3cfb 100644
--- a/.github/workflows/test-kafka.yml
+++ b/.github/workflows/test-kafka.yml
@@ -116,11 +116,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for Kafka to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 9093 -t 60
-          ./docker/wait-for-it.sh -h localhost -p 9095 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml
index ea230132f..a617450b6 100644
--- a/.github/workflows/test-mongodb.yml
+++ b/.github/workflows/test-mongodb.yml
@@ -81,10 +81,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for MongoDB to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 27017 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml
index 23d315a93..1d5ebb853 100644
--- a/.github/workflows/test-mssql.yml
+++ b/.github/workflows/test-mssql.yml
@@ -84,10 +84,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for MSSQL to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 1433 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml
index 66bda2e10..e2035cfc7 100644
--- a/.github/workflows/test-mysql.yml
+++ b/.github/workflows/test-mysql.yml
@@ -83,10 +83,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for MySQL to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 3306 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml
index 2438fce1d..e11a57b84 100644
--- a/.github/workflows/test-oracle.yml
+++ b/.github/workflows/test-oracle.yml
@@ -98,10 +98,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for Oracle to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 1522 -t 60
-
       - name: Run tests
         run: |
           export ONETL_ORA_CLIENT_PATH=./oracle/instantclient_21_10
diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml
index 87fd34731..ef31a0375 100644
--- a/.github/workflows/test-postgres.yml
+++ b/.github/workflows/test-postgres.yml
@@ -82,10 +82,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for Postgres to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 5432 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml
index 96775f3bf..8da4540cd 100644
--- a/.github/workflows/test-s3.yml
+++ b/.github/workflows/test-s3.yml
@@ -83,10 +83,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for S3 to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 9010 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml
index 3a7c1c921..58db08b88 100644
--- a/.github/workflows/test-samba.yml
+++ b/.github/workflows/test-samba.yml
@@ -57,11 +57,6 @@ jobs:
           docker compose up -d samba
         env:
           SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }}
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }}
-
-      - name: Wait for Samba to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 445 -t 60

       - name: Run tests
         run: |
@@ -74,8 +69,6 @@
         if: always()
         run: |
           docker compose down -v --remove-orphans
-        env:
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }}

       - name: Upload coverage results
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml
index 569d580f7..ffbf786f2 100644
--- a/.github/workflows/test-sftp.yml
+++ b/.github/workflows/test-sftp.yml
@@ -60,10 +60,6 @@ jobs:
         run: |
           pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt

-      - name: Wait for SFTP to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 2222 -t 60
-
       - name: Run tests
         run: |
           mkdir reports/ || echo "Directory exists"
diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml
index ee23f0ae8..472519643 100644
--- a/.github/workflows/test-webdav.yml
+++ b/.github/workflows/test-webdav.yml
@@ -59,11 +59,6 @@ jobs:
           docker compose up -d webdav
         env:
           WEBDAV_IMAGE: chonjay21/webdav:${{ inputs.webdav-version }}
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }}
-
-      - name: Wait for WebDAV to be ready
-        run: |
-          ./docker/wait-for-it.sh -h localhost -p 8000 -t 60

       - name: Run tests
         run: |
@@ -76,8 +71,6 @@
         if: always()
         run: |
           docker compose down -v --remove-orphans
-        env:
-          COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }}

       - name: Upload coverage results
         uses: actions/upload-artifact@v4
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 3d132ddb5..e7a60fc13 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -11,7 +11,7 @@ Limitations

 We should keep close to these items during development:

-* Some companies still use old Spark versions, like 2.3.0. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions.
+* Some companies still use old Spark versions, like 2.3.1. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions.
 * Different users uses onETL in different ways - some uses only DB connectors, some only files. Connector-specific dependencies should be optional.
 * Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition.
@@ -71,7 +71,7 @@ Create virtualenv and install dependencies:
         -r requirements/tests/postgres.txt \
         -r requirements/tests/oracle.txt \
         -r requirements/tests/pydantic-2.txt \
-        -r requirements/tests/spark-3.5.0.txt
+        -r requirements/tests/spark-3.5.1.txt

     # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4
     pip install sphinx-plantuml --no-deps
diff --git a/README.rst b/README.rst
index 625112b2c..da0b84cb6 100644
--- a/README.rst
+++ b/README.rst
@@ -187,17 +187,17 @@ Compatibility matrix
 +--------------------------------------------------------------+-------------+-------------+-------+
 | Spark | Python | Java | Scala |
 +==============================================================+=============+=============+=======+
-| `2.3.x `_ | 3.7 only | 8 only | 2.11 |
+| `2.3.x `_ | 3.7 only | 8 only | 2.11 |
 +--------------------------------------------------------------+-------------+-------------+-------+
 | `2.4.x `_ | 3.7 only | 8 only | 2.11 |
 +--------------------------------------------------------------+-------------+-------------+-------+
 | `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+
-| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 |
+| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+
-| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 |
+| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+
-| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 |
+| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+

 .. _pyspark-install:
@@ -212,7 +212,7 @@ or install PySpark explicitly:

 .. code:: bash

-    pip install onetl pyspark==3.5.0 # install a specific PySpark version
+    pip install onetl pyspark==3.5.1 # install a specific PySpark version

 or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance.
 **Otherwise connection object cannot be created.**
@@ -553,7 +553,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and then write to a database

     setup_logging()

     # Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded
-    maven_packages = SparkS3.get_packages(spark_version="3.5.0") + Postgres.get_packages()
+    maven_packages = SparkS3.get_packages(spark_version="3.5.1") + Postgres.get_packages()
     spark = (
         SparkSession.builder.appName("spark_app_onetl_demo")
         .config("spark.jars.packages", ",".join(maven_packages))
diff --git a/docker-compose.yml b/docker-compose.yml
index 6ba2aca64..54b2af91d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,7 +9,7 @@ services:
       context: .
       target: base
       args:
-        SPARK_VERSION: 3.5.0
+        SPARK_VERSION: 3.5.1
     env_file: .env.docker
     volumes:
       - ./:/app/
@@ -173,7 +173,7 @@ services:
       - onetl

   samba:
-    image: elswork/samba
+    image: ${SAMBA_IMAGE:-elswork/samba}
     restart: unless-stopped
     ports:
       - "139:139"
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 36cbb129f..d3d34ef21 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -42,10 +42,9 @@ USER onetl
 ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH}

 COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/
-COPY --chown=onetl:onetl ./docker/wait-for-it.sh /app/docker/wait-for-it.sh
-RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh /app/docker/wait-for-it.sh
+RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh

-ARG SPARK_VERSION=3.5.0
+ARG SPARK_VERSION=3.5.1
 # Spark is heavy, and version change is quite rare
 COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/
 RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt
diff --git a/docker/wait-for-it.sh b/docker/wait-for-it.sh
deleted file mode 100755
index 7410fa3a6..000000000
--- a/docker/wait-for-it.sh
+++ /dev/null
@@ -1,182 +0,0 @@
-#!/usr/bin/env bash
-# Use this script to test if a given TCP host/port are available
-
-WAITFORIT_cmdname=${0##*/}
-
-echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi }
-
-usage()
-{
-    cat << USAGE >&2
-Usage:
-    $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args]
-    -h HOST | --host=HOST       Host or IP under test
-    -p PORT | --port=PORT       TCP port under test
-                                Alternatively, you specify the host and port as host:port
-    -s | --strict               Only execute subcommand if the test succeeds
-    -q | --quiet                Don't output any status messages
-    -t TIMEOUT | --timeout=TIMEOUT
-                                Timeout in seconds, zero for no timeout
-    -- COMMAND ARGS             Execute command with args after the test finishes
-USAGE
-    exit 1
-}
-
-wait_for()
-{
-    if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then
-        echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT"
-    else
-        echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout"
-    fi
-    WAITFORIT_start_ts=$(date +%s)
-    while :
-    do
-        if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then
-            nc -z $WAITFORIT_HOST $WAITFORIT_PORT
-            WAITFORIT_result=$?
-        else
-            (echo > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1
-            WAITFORIT_result=$?
-        fi
-        if [[ $WAITFORIT_result -eq 0 ]]; then
-            WAITFORIT_end_ts=$(date +%s)
-            echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds"
-            break
-        fi
-        sleep 1
-    done
-    return $WAITFORIT_result
-}
-
-wait_for_wrapper()
-{
-    # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692
-    if [[ $WAITFORIT_QUIET -eq 1 ]]; then
-        timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT &
-    else
-        timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT &
-    fi
-    WAITFORIT_PID=$!
-    trap "kill -INT -$WAITFORIT_PID" INT
-    wait $WAITFORIT_PID
-    WAITFORIT_RESULT=$?
-    if [[ $WAITFORIT_RESULT -ne 0 ]]; then
-        echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT"
-    fi
-    return $WAITFORIT_RESULT
-}
-
-# process arguments
-while [[ $# -gt 0 ]]
-do
-    case "$1" in
-        *:* )
-        WAITFORIT_hostport=(${1//:/ })
-        WAITFORIT_HOST=${WAITFORIT_hostport[0]}
-        WAITFORIT_PORT=${WAITFORIT_hostport[1]}
-        shift 1
-        ;;
-        --child)
-        WAITFORIT_CHILD=1
-        shift 1
-        ;;
-        -q | --quiet)
-        WAITFORIT_QUIET=1
-        shift 1
-        ;;
-        -s | --strict)
-        WAITFORIT_STRICT=1
-        shift 1
-        ;;
-        -h)
-        WAITFORIT_HOST="$2"
-        if [[ $WAITFORIT_HOST == "" ]]; then break; fi
-        shift 2
-        ;;
-        --host=*)
-        WAITFORIT_HOST="${1#*=}"
-        shift 1
-        ;;
-        -p)
-        WAITFORIT_PORT="$2"
-        if [[ $WAITFORIT_PORT == "" ]]; then break; fi
-        shift 2
-        ;;
-        --port=*)
-        WAITFORIT_PORT="${1#*=}"
-        shift 1
-        ;;
-        -t)
-        WAITFORIT_TIMEOUT="$2"
-        if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi
-        shift 2
-        ;;
-        --timeout=*)
-        WAITFORIT_TIMEOUT="${1#*=}"
-        shift 1
-        ;;
-        --)
-        shift
-        WAITFORIT_CLI=("$@")
-        break
-        ;;
-        --help)
-        usage
-        ;;
-        *)
-        echoerr "Unknown argument: $1"
-        usage
-        ;;
-    esac
-done
-
-if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then
-    echoerr "Error: you need to provide a host and port to test."
-    usage
-fi
-
-WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15}
-WAITFORIT_STRICT=${WAITFORIT_STRICT:-0}
-WAITFORIT_CHILD=${WAITFORIT_CHILD:-0}
-WAITFORIT_QUIET=${WAITFORIT_QUIET:-0}
-
-# Check to see if timeout is from busybox?
-WAITFORIT_TIMEOUT_PATH=$(type -p timeout)
-WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH)
-
-WAITFORIT_BUSYTIMEFLAG=""
-if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then
-    WAITFORIT_ISBUSY=1
-    # Check if busybox timeout uses -t flag
-    # (recent Alpine versions don't support -t anymore)
-    if timeout &>/dev/stdout | grep -q -e '-t '; then
-        WAITFORIT_BUSYTIMEFLAG="-t"
-    fi
-else
-    WAITFORIT_ISBUSY=0
-fi
-
-if [[ $WAITFORIT_CHILD -gt 0 ]]; then
-    wait_for
-    WAITFORIT_RESULT=$?
-    exit $WAITFORIT_RESULT
-else
-    if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then
-        wait_for_wrapper
-        WAITFORIT_RESULT=$?
-    else
-        wait_for
-        WAITFORIT_RESULT=$?
-    fi
-fi
-
-if [[ $WAITFORIT_CLI != "" ]]; then
-    if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then
-        echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess"
-        exit $WAITFORIT_RESULT
-    fi
-    exec "${WAITFORIT_CLI[@]}"
-else
-    exit $WAITFORIT_RESULT
-fi
diff --git a/requirements/tests/spark-3.3.3.txt b/requirements/tests/spark-3.3.4.txt
similarity index 80%
rename from requirements/tests/spark-3.3.3.txt
rename to requirements/tests/spark-3.3.4.txt
index 259340bf6..55629ed65 100644
--- a/requirements/tests/spark-3.3.3.txt
+++ b/requirements/tests/spark-3.3.4.txt
@@ -1,5 +1,5 @@
 numpy>=1.16,<1.24
 pandas>=1.0,<2
 pyarrow>=1.0
-pyspark==3.3.3
+pyspark==3.3.4
 sqlalchemy<2.0
diff --git a/requirements/tests/spark-3.4.2.txt b/requirements/tests/spark-3.4.3.txt
similarity index 76%
rename from requirements/tests/spark-3.4.2.txt
rename to requirements/tests/spark-3.4.3.txt
index c7173637d..5ea738d58 100644
--- a/requirements/tests/spark-3.4.2.txt
+++ b/requirements/tests/spark-3.4.3.txt
@@ -1,5 +1,5 @@
 numpy>=1.16
 pandas>=1.0
 pyarrow>=1.0
-pyspark==3.4.2
+pyspark==3.4.3
 sqlalchemy
diff --git a/requirements/tests/spark-3.5.1.txt b/requirements/tests/spark-3.5.1.txt
new file mode 100644
index 000000000..d1e812f7a
--- /dev/null
+++ b/requirements/tests/spark-3.5.1.txt
@@ -0,0 +1,5 @@
+numpy>=1.16
+pandas>=1.0
+pyarrow>=1.0
+pyspark==3.5.1
+sqlalchemy
diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py
index e38de7413..72314b5b3 100644
--- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py
+++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py
@@ -155,6 +155,7 @@ def test_clickhouse_reader_snapshot_with_columns(spark, processing, load_table_data):
     assert count_df.collect()[0][0] == table_df.count()


+@pytest.mark.xfail(reason="Clickhouse <24 deduplicated column names, but 24+ does not")
 def test_clickhouse_reader_snapshot_with_columns_duplicated(spark, processing, prepare_schema_table):
     clickhouse = Clickhouse(
         host=processing.host,
@@ -180,9 +181,9 @@ def test_clickhouse_reader_snapshot_with_columns_duplicated(spark, processing, prepare_schema_table):
         ],
     )

-    # Clickhouse can detect that column is already a part of * and does not produce duplicates
-    df2 = reader2.run()
-    assert df1.columns == df2.columns
+    with pytest.raises(Exception, match="The column `id_int` already exists"):
+        df2 = reader2.run()
+        assert df1.columns == df2.columns


 def test_clickhouse_reader_snapshot_with_columns_mixed_naming(spark, processing, get_schema_table):
diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py
index c786b1fe0..18047d749 100644
--- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py
+++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py
@@ -250,7 +250,7 @@ def table_finalizer():
         assert not clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}").count()


-@pytest.mark.xfail(reason="Clickhouse 20.7 doesn't support functions")
+@pytest.mark.xfail(reason="CREATE FUNCTION is not supported in Clickhouse < 21.10")
 @pytest.mark.parametrize("suffix", ["", ";"])
 def test_clickhouse_connection_execute_function(
     request,
diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py
index 77ec071b3..e739655c8 100644
--- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py
+++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py
@@ -212,7 +212,7 @@ def test_clickhouse_strategy_incremental_nothing_to_read(spark, processing, prepare_schema_table):
     [
         ("float_value", ValueError, "Expression 'float_value' returned values"),
         ("text_string", RuntimeError, "Cannot detect HWM type for"),
-        ("unknown_column", Exception, "Missing columns"),
+        ("unknown_column", Exception, "(Missing columns|Unknown expression).*"),
     ],
 )
 def test_clickhouse_strategy_incremental_wrong_hwm(