diff --git a/.env b/.env index e0b358c..8206044 100644 --- a/.env +++ b/.env @@ -1,7 +1,7 @@ -CHADWICK_VERSION=v0.9.3 -BASEBALLDATABANK_VERSION=dd1a4503b9d6ec2bdda5e345ba06c867e368dd13 -RETROSHEET_VERSION=e540755f22b65d2f85f4da9180d1a31754c331f9 +CHADWICK_VERSION=v0.9.5 +BASEBALLDATABANK_VERSION=ccb3cef05e68f0085db4ada6d4a9ebab9435b452 +RETROSHEET_VERSION=48334a58f7446d59746d81aa73c3e9fa9b2676e9 EXTRACT_DIR=extract REPO=doublewick/boxball -VERSION=2022.0.0 +VERSION=2023.0.0 diff --git a/docker-compose.yml b/docker-compose.yml index 8e7f632..38ba960 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,6 +42,10 @@ x-clickhouse: &clickhouse build: context: load/clickhouse + dockerfile: ../Dockerfile + target: clickhouse + platforms: + - "linux/amd64" args: - VERSION image: ${REPO}:clickhouse-${VERSION} @@ -55,6 +59,10 @@ x-drill: &drill build: context: load/drill + dockerfile: ../Dockerfile + target: drill + platforms: + - "linux/amd64" args: - VERSION image: ${REPO}:drill-${VERSION} @@ -68,6 +76,10 @@ x-postgres: &postgres build: context: load/postgres + dockerfile: ../Dockerfile + target: postgres + platforms: + - "linux/amd64" args: - VERSION image: ${REPO}:postgres-${VERSION} @@ -81,6 +93,10 @@ x-postgres-cstore-fdw: &postgres-cstore-fdw build: context: load/postgres_cstore_fdw + dockerfile: ../Dockerfile + target: postgres-cstore-fdw + platforms: + - "linux/amd64" args: - VERSION image: ${REPO}:postgres-cstore-fdw-${VERSION} @@ -94,6 +110,10 @@ x-mysql: &mysql build: context: load/mysql + dockerfile: ../Dockerfile + target: mysql + platforms: + - "linux/amd64" args: - VERSION image: ${REPO}:mysql-${VERSION} @@ -107,6 +127,10 @@ x-sqlite: &sqlite build: context: load/sqlite + dockerfile: ../Dockerfile + target: sqlite + platforms: + - "linux/amd64" args: - VERSION image: ${REPO}:sqlite-${VERSION} diff --git a/extract/Dockerfile b/extract/Dockerfile index c8ea14c..9191fa3 100644 --- a/extract/Dockerfile +++ b/extract/Dockerfile @@ -2,7 +2,7 @@ ARG 
BUILD_ENV ARG RETROSHEET_IMAGE=get-retrosheet-${BUILD_ENV} ARG BASEBALLDATABANK_IMAGE=get-baseballdatabank-${BUILD_ENV} -FROM python:3.10.4-alpine3.15 AS build-common +FROM python:3.11-alpine3.17 AS build-common RUN apk add --no-cache \ parallel \ libtool \ diff --git a/load/Dockerfile b/load/Dockerfile new file mode 100644 index 0000000..30bff86 --- /dev/null +++ b/load/Dockerfile @@ -0,0 +1,71 @@ +ARG VERSION +FROM doublewick/boxball:ddl-${VERSION} as ddl +FROM doublewick/boxball:parquet-${VERSION} as parquet +FROM doublewick/boxball:csv-${VERSION} as csv + +FROM yandex/clickhouse-server:22.9.7.34 as clickhouse +COPY z_load.sh /docker-entrypoint-initdb.d/ +COPY --chown=clickhouse:clickhouse --from=ddl /ddl/clickhouse.sql /docker-entrypoint-initdb.d/ +COPY --chown=clickhouse:clickhouse --from=parquet /transform/parquet /data + +FROM drill/apache-drill:1.17.0 as drill +COPY --from=parquet /transform/parquet /data + +FROM mysql:8.0.31-debian as mysql +ENV MYSQL_ALLOW_EMPTY_PASSWORD=yes +COPY my.cnf /etc/mysql/conf.d/ +COPY A_unzip_csvs.sh z_remove_csvs.sh /docker-entrypoint-initdb.d/ +RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +COPY --chown=mysql:mysql --from=ddl /ddl/mysql.sql /docker-entrypoint-initdb.d/ +COPY --chown=mysql:mysql --from=csv /transform/csv /data + +FROM postgres:15.1 as postgres +RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +COPY A_build_conf.sql z_run_conf.sql /docker-entrypoint-initdb.d/ +COPY --chown=postgres:postgres --from=ddl /ddl/postgres.sql /docker-entrypoint-initdb.d/ +COPY --chown=postgres:postgres --from=csv /transform/csv /data + +FROM postgres:13.2 as postgres-cstore-fdw-build +RUN apt-get update && apt-get install -y --no-install-recommends postgresql-server-dev-13 build-essential zstd libprotobuf-c-dev protobuf-c-compiler wget ca-certificates unzip make gcc libpq-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +RUN wget 
https://github.com/citusdata/cstore_fdw/archive/master.zip -O cstore_fdw.zip && \ + unzip cstore_fdw.zip && \ + mv cstore_fdw-master cstore_fdw +WORKDIR /cstore_fdw +RUN make && \ + make install +WORKDIR / +RUN echo "shared_preload_libraries = 'cstore_fdw'" >> "${PGDATA}/postgresql.conf" +COPY --chown=postgres:postgres --from=ddl /ddl/postgres_cstore_fdw.sql /docker-entrypoint-initdb.d/ +COPY --chown=postgres:postgres --from=csv /transform/csv /data +RUN cat /docker-entrypoint-initdb.d/postgres_cstore_fdw.sql + +FROM postgres-cstore-fdw-build as postgres-cstore-fdw + +FROM alpine:3.17 as sqlite-build +RUN apk add --no-cache \ + zstd \ + sqlite +RUN sqlite3 boxball.db ".databases" +COPY --from=ddl /ddl/sqlite.sql . +COPY --from=csv /transform/csv /data +RUN echo "Decompressing files..." && \ + for f in /data/**/*.csv.zst; do zstd --rm -d ${f}; done && \ + echo "Building db..." && \ + < sqlite.sql sqlite3 -bail -echo boxball.db && \ + rm -rf /data && \ + zstd --rm boxball.db + + +FROM python:3.11-alpine3.17 AS sqlite +RUN apk add --no-cache \ + zstd \ + sqlite +RUN pip install sqlite-web==0.4.1 +COPY --from=sqlite-build boxball.db.zst /tmp/ +ENTRYPOINT zstd --rm -d /tmp/boxball.db.zst -fo /db/boxball.db && sqlite_web -H 0.0.0.0 -x /db/boxball.db diff --git a/load/clickhouse/Dockerfile b/load/clickhouse/Dockerfile deleted file mode 100644 index 08bb69b..0000000 --- a/load/clickhouse/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -ARG VERSION -FROM doublewick/boxball:ddl-${VERSION} as ddl -FROM doublewick/boxball:parquet-${VERSION} as parquet - -FROM yandex/clickhouse-server:21.3.2.5 -COPY z_load.sh /docker-entrypoint-initdb.d/ -COPY --chown=clickhouse:clickhouse --from=ddl /ddl/clickhouse.sql /docker-entrypoint-initdb.d/ -COPY --chown=clickhouse:clickhouse --from=parquet /transform/parquet /data diff --git a/load/drill/Dockerfile b/load/drill/Dockerfile deleted file mode 100644 index c07d786..0000000 --- a/load/drill/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -ARG VERSION -FROM 
doublewick/boxball:parquet-${VERSION} as parquet - -FROM drill/apache-drill:1.17.0 -COPY --from=parquet /transform/parquet /data diff --git a/load/mysql/Dockerfile b/load/mysql/Dockerfile deleted file mode 100644 index 22d39ca..0000000 --- a/load/mysql/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -ARG VERSION -FROM doublewick/boxball:ddl-${VERSION} as ddl -FROM doublewick/boxball:csv-${VERSION} as csv - -FROM mysql:8.0.28-debian as mysql-build -ENV MYSQL_ALLOW_EMPTY_PASSWORD=yes -COPY my.cnf /etc/mysql/conf.d/ -COPY A_unzip_csvs.sh z_remove_csvs.sh /docker-entrypoint-initdb.d/ -RUN apt-get update && apt-get install -y --no-install-recommends zstd && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* -COPY --chown=mysql:mysql --from=ddl /ddl/mysql.sql /docker-entrypoint-initdb.d/ -COPY --chown=mysql:mysql --from=csv /transform/csv /data diff --git a/load/postgres/Dockerfile b/load/postgres/Dockerfile deleted file mode 100644 index 3e9aad7..0000000 --- a/load/postgres/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -ARG VERSION -FROM doublewick/boxball:ddl-${VERSION} as ddl -FROM doublewick/boxball:csv-${VERSION} as csv - -FROM postgres:13.2 -RUN apt-get update && apt-get install -y --no-install-recommends zstd && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* -COPY A_build_conf.sql z_run_conf.sql /docker-entrypoint-initdb.d/ -COPY --chown=postgres:postgres --from=ddl /ddl/postgres.sql /docker-entrypoint-initdb.d/ -COPY --chown=postgres:postgres --from=csv /transform/csv /data diff --git a/load/sqlite/Dockerfile b/load/sqlite/Dockerfile deleted file mode 100644 index 96a1d20..0000000 --- a/load/sqlite/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -ARG VERSION -FROM doublewick/boxball:ddl-${VERSION} as ddl -FROM doublewick/boxball:csv-${VERSION} as csv - -FROM alpine:3.9.3 as build -RUN apk add --no-cache \ - zstd \ - sqlite -RUN sqlite3 boxball.db ".databases" -COPY --from=ddl /ddl/sqlite.sql . -COPY --from=csv /transform/csv /data -RUN echo "Decompressing fies..." 
&& \ - for f in /data/**/*.csv.zst; do zstd --rm -d ${f}; done && \ - echo "Building db..." && \ - < sqlite.sql sqlite3 -bail -echo boxball.db && \ - rm -rf /data && \ - zstd --rm boxball.db - - -FROM python:3.7.3-alpine3.9 -RUN apk add --no-cache \ - zstd \ - sqlite -RUN pip install sqlite-web==0.3.7 -COPY --from=build boxball.db.zst /tmp/ -ENTRYPOINT zstd --rm -d /tmp/boxball.db.zst -fo /db/boxball.db && sqlite_web -H 0.0.0.0 -x /db/boxball.db diff --git a/requirements.txt b/requirements.txt index 36c352a..630f045 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -pyhumps==1.6.1 -zstandard==0.15.2 -SQLAlchemy==1.3.23 +pyhumps==3.8.0 +zstandard==0.19.0 +SQLAlchemy==1.4.45 sqlalchemy-fdw==0.3.0 -clickhouse-sqlalchemy==0.1.5 -pyarrow==3.0.0 -pytest==6.2.2 -pytest-cov==2.11.1 +clickhouse-sqlalchemy==0.2.3 +pyarrow==10.0.1 +pytest==7.2.0 +pytest-cov==4.0.0 codacy-coverage==1.3.11 \ No newline at end of file diff --git a/transform/ddl.Dockerfile b/transform/ddl.Dockerfile index 1725add..9c00d56 100644 --- a/transform/ddl.Dockerfile +++ b/transform/ddl.Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.10-slim-bullseye AS build-common +FROM python:3.11-slim-bullseye AS build-common COPY requirements.txt . RUN pip install -r requirements.txt ENV PYTHONPATH="/" diff --git a/transform/parquet.Dockerfile b/transform/parquet.Dockerfile index c819da8..b64c4b2 100644 --- a/transform/parquet.Dockerfile +++ b/transform/parquet.Dockerfile @@ -1,7 +1,7 @@ ARG VERSION FROM doublewick/boxball:extract-${VERSION} as extract -FROM python:3.10-slim-bullseye AS build-common +FROM python:3.11-slim-bullseye AS build-common COPY requirements.txt . 
RUN pip install -r requirements.txt ENV PYTHONPATH="/" diff --git a/transform/requirements.txt b/transform/requirements.txt index 98fc2c0..b327a30 100644 --- a/transform/requirements.txt +++ b/transform/requirements.txt @@ -1,5 +1,5 @@ SQLAlchemy==1.3.23 sqlalchemy-fdw==0.3.0 clickhouse-sqlalchemy==0.1.5 -pyarrow==7.0.0 +pyarrow==10.0.1 zstandard==0.17.0 diff --git a/transform/src/parquet.py b/transform/src/parquet.py index 6e15fb9..5f3ea1a 100644 --- a/transform/src/parquet.py +++ b/transform/src/parquet.py @@ -16,7 +16,7 @@ # How many bytes in each CSV chunk to bring into memory. # Larger sizes result in better compression and slightly faster time, # but don't want to risk OOM issues on small build boxes. -BUFFER_SIZE_BYTES = 1000000000 +BUFFER_SIZE_BYTES = 500000000 sql_type_lookup: Dict[Type[TypeEngine], str] = { Integer: 'int32', @@ -57,7 +57,7 @@ def get_path(prefix: Path, suffix: str): arrow_schema = pa.schema(get_fields(table)) column_names = [name for name, dtype in get_fields(table)] - read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000) + read_options = pcsv.ReadOptions(column_names=column_names, block_size=BUFFER_SIZE_BYTES) parse_options = pcsv.ParseOptions(newlines_in_values=True) convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"], true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True) diff --git a/transform/src/schemas/baseballdatabank.py b/transform/src/schemas/baseballdatabank.py index c1b7dcc..0e79eda 100644 --- a/transform/src/schemas/baseballdatabank.py +++ b/transform/src/schemas/baseballdatabank.py @@ -246,9 +246,10 @@ class HallOfFame(Base): player_id = Column(String(10), primary_key=True, nullable=False) year_id = Column(SmallInteger, primary_key=True, nullable=False) voted_by = Column(String(64), primary_key=True, nullable=False) - ballots = Column(SmallInteger) - needed = Column(SmallInteger) - votes = Column(SmallInteger) + # The 3 
below are actually ints but there are some irregular nulls + ballots = Column(String(64)) + needed = Column(String(64)) + votes = Column(String(64)) inducted = Column(String(1)) category = Column(String(20)) needed_note = Column(String(25)) diff --git a/transform/src/schemas/retrosheet.py b/transform/src/schemas/retrosheet.py index 88c4632..11a6b33 100644 --- a/transform/src/schemas/retrosheet.py +++ b/transform/src/schemas/retrosheet.py @@ -77,7 +77,8 @@ class Roster(Base): bats = Column(CHAR(1), doc="Bat handedness") throws = Column(CHAR(1), doc="Throw handedness") team_id = Column(CHAR(3), primary_key=True, doc="Team ID") - position = Column(String(2), doc="Primary fielding position") + # TODO: Remove duplicate roster entry(s) + position = Column(String(2), primary_key=True, doc="Primary fielding position") class Schedule(Base):