Merge branch 'develop'
kupferk committed Mar 23, 2021
2 parents f0dab8d + 852c5c5 commit 35b3cd8
Showing 606 changed files with 23,479 additions and 8,579 deletions.
149 changes: 149 additions & 0 deletions .gitlab-ci.yml
@@ -0,0 +1,149 @@
stages:
  - build

variables:
  # Suppress any download or upload progress messages for dependencies and plugins, which would otherwise clutter the console log.
  # `showDateTime` will show the elapsed time in milliseconds. You need to specify `--batch-mode` to make this work.
  MAVEN_OPTS: "-Dhttp.proxyHost=${http_proxy_host} -Dhttp.proxyPort=${http_proxy_port} -Dhttps.proxyHost=${http_proxy_host} -Dhttps.proxyPort=${http_proxy_port} -Dhttps.protocols=TLSv1.2 -Dmaven.repo.local=$CI_PROJECT_DIR/.m2/repository -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Dorg.slf4j.simpleLogger.showDateTime=true -Djava.awt.headless=true -XX:ReservedCodeCacheSize=512m"
  # As of Maven 3.3.0, these options may instead be defined in `.mvn/maven.config`, so that the same configuration
  # is used when running from the command line.
  # `installAtEnd` and `deployAtEnd` are only effective with recent versions of the corresponding plugins.
  MAVEN_CLI_OPTS: "--batch-mode --errors --fail-at-end --show-version -DinstallAtEnd=true -DdeployAtEnd=true"

image: dimajix/maven-npm:latest

# Cache downloaded dependencies and plugins between builds.
cache:
  key: flowman-${CI_JOB_NAME}
  paths:
    - .m2/repository
    - .npm


# Build site and reports
build-site:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean install -DskipTests && mvn ${MAVEN_CLI_OPTS} site'
  artifacts:
    name: "flowman-site"
    paths:
      - target/site
      - flowman-*/target/site
      - flowman-plugins/*/target/site
    expire_in: 5 days


# Default build variant
build-default:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package'
  except:
    - pushes
  artifacts:
    name: "flowman-dist-default"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

# Additional build variants (only some of them are built on pushes)
build-hadoop2.6-spark2.3:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.6 -Pspark-2.3'
  artifacts:
    name: "flowman-dist-hadoop2.6-spark2.3"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop2.6-spark2.4:
  stage: build
  except:
    - pushes
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.6 -Pspark-2.4'
  artifacts:
    name: "flowman-dist-hadoop2.6-spark2.4"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop2.9-spark2.4:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.9 -Pspark-2.4'
  artifacts:
    name: "flowman-dist-hadoop2.9-spark2.4"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop3.1-spark2.4:
  stage: build
  except:
    - pushes
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-3.1 -Pspark-2.4'
  artifacts:
    name: "flowman-dist-hadoop3.1-spark2.4"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop2.9-spark3.0:
  stage: build
  except:
    - pushes
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.9 -Pspark-3.0'
  artifacts:
    name: "flowman-dist-hadoop2.9-spark3.0"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop3.1-spark3.0:
  stage: build
  except:
    - pushes
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-3.1 -Pspark-3.0'
  artifacts:
    name: "flowman-dist-hadoop3.1-spark3.0"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop3.2-spark3.0:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-3.2 -Pspark-3.0'
  artifacts:
    name: "flowman-dist-hadoop3.2-spark3.0"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop3.2-spark3.1:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-3.2 -Pspark-3.1'
  artifacts:
    name: "flowman-dist-hadoop3.2-spark3.1"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-cdh5.15:
  stage: build
  except:
    - pushes
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -PCDH-5.15'
  artifacts:
    name: "flowman-dist-cdh5.15"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-cdh6.3:
  stage: build
  except:
    - pushes
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -PCDH-6.3'
  artifacts:
    name: "flowman-dist-cdh6.3"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days
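
As a side note, the `.mvn/maven.config` alternative mentioned in the comments above could look like the
following - a minimal sketch, assuming the file simply carries the same flags as `MAVEN_CLI_OPTS` (the
exact contents shown here are illustrative, not part of this file):

```shell
# Illustrative contents of .mvn/maven.config (picked up automatically by Maven >= 3.3.0):
--batch-mode --errors --fail-at-end --show-version -DinstallAtEnd=true -DdeployAtEnd=true
```
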
4 changes: 4 additions & 0 deletions .travis.yml
@@ -63,6 +63,10 @@ jobs:
      jdk: openjdk8
      script: mvn clean install -Phadoop-3.2 -Pspark-3.0

    - name: Hadoop 3.2 with Spark 3.1
      jdk: openjdk8
      script: mvn clean install -Phadoop-3.2 -Pspark-3.1

    - name: CDH 5.15
      jdk: openjdk8
      script: mvn clean install -PCDH-5.15
118 changes: 79 additions & 39 deletions BUILDING.md
@@ -7,7 +7,9 @@ is installed on the build machine.

Building Flowman with the default settings (i.e. Hadoop and Spark versions) is as easy as:

```shell
mvn clean install
```

## Main Artifacts

@@ -28,11 +30,15 @@
You should also configure git such that all files are checked out using "LF" endings, as otherwise
some unittests may fail and Docker images might not be usable. This can be done by setting the git configuration
value "core.autocrlf" to "input":

```shell
git config --global core.autocrlf input
```

You might also want to skip unittests (the HBase plugin is currently failing under Windows):

```shell
mvn clean install -DskipTests
```

It may well be the case that some unittests fail on Windows - don't panic; we focus on Linux systems and ensure that
the `master` branch really builds clean with all unittests passing on Linux.
@@ -42,20 +48,25 @@

By default, Flowman is built for fairly recent versions of Spark (2.4.5 as of this writing) and Hadoop (2.8.5).
But you can also build for a different version, either by using a profile:

```shell
mvn install -Pspark-2.3 -Phadoop-2.7 -DskipTests
```

This will always select the latest bugfix version within the minor version. You can also specify versions explicitly
as follows:

```shell
mvn install -Dspark.version=2.2.1 -Dhadoop.version=2.7.3
```

Note that using profiles is the preferred way, since this guarantees that dependencies are also selected
in the correct version. The following profiles are available:

* spark-2.3
* spark-2.4
* spark-3.0
* spark-3.1
* hadoop-2.6
* hadoop-2.7
* hadoop-2.8
@@ -69,63 +80,92 @@
With these profiles it is easy to build Flowman to match your environment.

## Building for Open Source Hadoop and Spark

### Spark 2.3 and Hadoop 2.6

```shell
mvn clean install -Pspark-2.3 -Phadoop-2.6
```

### Spark 2.3 and Hadoop 2.7

```shell
mvn clean install -Pspark-2.3 -Phadoop-2.7
```

### Spark 2.3 and Hadoop 2.8

```shell
mvn clean install -Pspark-2.3 -Phadoop-2.8
```

### Spark 2.3 and Hadoop 2.9

```shell
mvn clean install -Pspark-2.3 -Phadoop-2.9
```

### Spark 2.4 and Hadoop 2.6

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.6
```

### Spark 2.4 and Hadoop 2.7

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.7
```

### Spark 2.4 and Hadoop 2.8

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.8
```

### Spark 2.4 and Hadoop 2.9

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.9
```

### Spark 3.0 and Hadoop 3.1

```shell
mvn clean install -Pspark-3.0 -Phadoop-3.1
```

### Spark 3.0 and Hadoop 3.2

```shell
mvn clean install -Pspark-3.0 -Phadoop-3.2
```

### Spark 3.1 and Hadoop 3.2

```shell
mvn clean install -Pspark-3.1 -Phadoop-3.2
```

## Building for Cloudera

The Maven project also contains preconfigured profiles for Cloudera.

```shell
mvn clean install -Pspark-2.3 -PCDH-5.15 -DskipTests
```

Or for Cloudera 6.3:

```shell
mvn clean install -Pspark-2.4 -PCDH-6.3 -DskipTests
```

## Skipping Docker Image

Part of the build is also a Docker image. Since you might not want to use it, for example because you are using
different base images, you can skip building the Docker image via `-Ddockerfile.skip`.
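
For example, a complete build that skips the Docker image would then look like this (a minimal sketch, combining
the flag from the text above with the default build command):

```shell
mvn clean install -Ddockerfile.skip
```
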
# Coverage Analysis

A Scala code coverage report can be generated via scoverage:
```shell
mvn scoverage:report
```

# Building Documentation

Flowman also contains Markdown documentation which is processed by Sphinx to generate the online HTML documentation.

36 changes: 35 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,38 @@
# Version 0.15.0 - 2021-03-23

* New configuration variable `flowman.default.target.rebalance`
* New configuration variable `flowman.default.target.parallelism`
* Changed behaviour: The `mergeFiles` target no longer assumes that the `target` is local. If you already
use `mergeFiles` with a local file, you need to prefix the target file name with `file://`.
* Add new `-t` argument for selectively building a subset of targets
* Remove example-plugin
* Add quickstart guide
* Add new "flowman-parent" BOM for projects using Flowman
* Move `com.dimajix.flowman.annotations` package to `com.dimajix.flowman.spec.annotations`
* Add new log redaction
* Integrate Scala code coverage analysis
* `assemble` will fail when trying to use non-existing columns
* Move `swagger` and `json` schema support into separate plugins
* Change default build to Spark 3.0 and Hadoop 3.2
* Update Spark to 3.0.2
* Rename class `Executor` to `Execution` - watch your plugins!
* Implement new configurable `Executor` class for executing build targets.
* Add build profile for Spark 3.1.x
* Update ScalaTest to 3.2.5 - watch your unittests for changed ScalaTest API!
* Add new `case` mapping
* Add new `--dry-run` command line option
* Add new `mock` and `null` mapping types
* Add new `mock` relation
* Add new `values` mapping
* Add new `values` dataset
* Implement new testing capabilities
* Rename `update` mapping to `upsert` mapping, which better describes its functionality
* Introduce new `VALIDATE` phase, which is executed even before the `CREATE` phase
* Implement new `validate` and `verify` targets
* Implement new `deptree` command in Flowman shell


# Version 0.14.2 - 2020-10-12

* Upgrade to Spark 2.4.7 and Spark 3.0.1
* Clean up dependencies