diff --git a/.circleci/config.yml b/.circleci/config.yml index bcfc1b225..ae9cdeca5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -181,24 +181,6 @@ jobs: command: | gcloud components install gke-gcloud-auth-plugin - - run: - name: Deploy astro - command: | - hubploy deploy --timeout 30m astro hub ${CIRCLE_BRANCH} - no_output_timeout: 30m - - - run: - name: Deploy biology - command: | - hubploy deploy --timeout 30m biology hub ${CIRCLE_BRANCH} - no_output_timeout: 30m - - - run: - name: Deploy cee - command: | - hubploy deploy --timeout 30m cee hub ${CIRCLE_BRANCH} - no_output_timeout: 30m - - run: name: Deploy data8 command: | @@ -311,33 +293,6 @@ workflows: version: 2 test-build-images: jobs: - - hubploy/build-image: - deployment: astro - name: astro image build - # Filters can only be per-job? wtf - filters: - branches: - ignore: - - staging - - prod - - hubploy/build-image: - deployment: biology - name: biology image build - # Filters can only be per-job? wtf - filters: - branches: - ignore: - - staging - - prod - - hubploy/build-image: - deployment: cee - name: cee image build - # Filters can only be per-job? wtf - filters: - branches: - ignore: - - staging - - prod - hubploy/build-image: deployment: data8 name: data8 image build @@ -451,33 +406,6 @@ workflows: deploy: jobs: - - hubploy/build-image: - deployment: astro - name: astro image build - push: true - # Filters can only be per-job? wtf - filters: - branches: - only: - - staging - - hubploy/build-image: - deployment: biology - name: biology image build - push: true - # Filters can only be per-job? wtf - filters: - branches: - only: - - staging - - hubploy/build-image: - deployment: cee - name: cee image build - push: true - # Filters can only be per-job? wtf - filters: - branches: - only: - - staging - hubploy/build-image: deployment: data8 name: data8 image build @@ -591,9 +519,6 @@ workflows: # CI level, we also make prod deploys go faster! - deploy: requires: - - astro image build - - biology image build - - cee image build - data8 image build - data100 image build - data102 image build diff --git a/.github/workflows/deploy-hubs.yaml b/.github/workflows/deploy-hubs.yaml index 84cf3b9a2..af3c1948f 100644 --- a/.github/workflows/deploy-hubs.yaml +++ b/.github/workflows/deploy-hubs.yaml @@ -103,7 +103,7 @@ jobs: echo "Deploying single-user image and hub config to ${deployment}" hubploy --verbose deploy --timeout 30m ${deployment} hub staging echo - done < <(python .github/scripts/determine-hub-deployments.py --only-deploy logodev stat159 stat20 nature a11y ugr01 data101) + done < <(python .github/scripts/determine-hub-deployments.py --only-deploy logodev stat159 stat20 nature a11y ugr01 data101) # astro biology cee) deploy-hubs-to-prod: if: github.event_name == 'push' && github.ref == 'refs/heads/prod' @@ -198,4 +198,4 @@ jobs: echo "Deploying single-user image and hub config to ${deployment}" hubploy --verbose deploy --timeout 30m ${deployment} hub prod echo - done < <(python .github/scripts/determine-hub-deployments.py --only-deploy logodev stat159 stat20 nature a11y ugr01 data101) + done < <(python .github/scripts/determine-hub-deployments.py --only-deploy logodev stat159 stat20 nature a11y ugr01 data101) # astro biology cee) diff --git a/deployments/astro/hubploy.yaml b/deployments/astro/hubploy.yaml index 43bb0b893..a82b84c7b 100644 --- a/deployments/astro/hubploy.yaml +++ b/deployments/astro/hubploy.yaml @@ -1,10 +1,5 @@ images: - image_name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/astro-user-image - registry: - provider: gcloud - gcloud: - project: ucb-datahub-2018 - service_key: gcr-key.json + image_name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/astro-user-image:placeholder cluster: provider: gcloud diff --git a/deployments/astro/image/README.md b/deployments/astro/image/README.md new file mode 100644 index 000000000..1184a6a58 --- /dev/null +++ b/deployments/astro/image/README.md @@ -0,0 +1,5 @@ +# Astro Image + +This image is now located [in its own repo](https://github.com/berkeley-dsep-infra/astro-user-image). + +Please see [the contribution guide](https://github.com/berkeley-dsep-infra/astro-user-image/blob/main/CONTRIBUTING.md) for instructions on how to propose changes to the image. diff --git a/deployments/astro/image/apt.txt b/deployments/astro/image/apt.txt deleted file mode 100644 index 422e2ccbc..000000000 --- a/deployments/astro/image/apt.txt +++ /dev/null @@ -1,30 +0,0 @@ -# installing less as more just isn't enough -less - -# For desktop environment -dbus-x11 -xfce4 -xfce4-panel -xfce4-session -xfce4-settings -xorg -xubuntu-icon-theme -# You definitely need a browser -firefox -# And a text editor -gedit -# And a terminal -xfce4-terminal - -# https://github.com/berkeley-dsep-infra/datahub/issues/2535 -emacs - -# vim4ever! -vim - -# For jupyter-tree-download. Ref: https://github.com/berkeley-dsep-infra/datahub/issues/3979 -zip - -# playwright deps https://jira-secure.berkeley.edu/browse/DH-305 -libnss3 -libnspr4 diff --git a/deployments/astro/image/environment.yml b/deployments/astro/image/environment.yml deleted file mode 100644 index af9024c5a..000000000 --- a/deployments/astro/image/environment.yml +++ /dev/null @@ -1,109 +0,0 @@ -name: astrods - -channels: -- conda-forge -- pytorch - -dependencies: -- python=3.11.* -- jupyter-server-proxy>=4.3 -# A linux desktop environment -- websockify -- numpy==1.26.4 -- numexpr>=2.8.7 -- matplotlib==3.9.* -- graphviz>=2.50.0 -- psycopg2==2.9.* -- pytorch -- torchvision -- pytorch-lightning -- cartopy -- pandas>=2.2.2 -- protobuf -- jupysql -- mpich -- mpi4py -- mkl -- mkl-service -- netcdf4>=1.6 -- scikit-learn==1.5.* -- scikit-image==0.24.* -- scipy==1.14.* -- tensorflow-cpu==2.17.0 -- tensorflow==2.17.0 -- cudatoolkit -- jax>=0.4.31 -- jaxlib>=0.4.31 -- cython==3.0.11 -- sqlite==3.46.0 -- pip - -- pip: - - -r infra-requirements.txt - - opencv-python>=4.5.5.62 - - nose2>=0.10.0 - - sympy - - beautifulsoup4 - - tqdm - - corner==2.2.* - - dask>=2024.8.1 - - dask-ml>=2024.4.4 - - distributed>=2024.8.1 - - SQLAlchemy>=2.0.32 - - xarray>=2024.7.0 - - nltk>=3.9.1 - - seaborn>=0.13.2 - - bqplot>=0.12.43 - - astroquery>=0.4.7 - - astropy>=6.1.2 - - dustmaps>=1.0.13 - - pyvo>=1.5.2 - - joblib==1.4.2 - - pymc>=5.16.2 - - requests>=2.32.3 - - ipycanvas - - altair>=5.4.0 - - vega3>=0.13.0 - - pytz>=2024.1 - - lightkurve - - ipyparallel - - line_profiler>=4.1.3 - - snakeviz>=2.2.0 - - memory_profiler>=0.61.0 - - flask>=3.0.3 - - stsci.tools>=4.1.0 - - gensim>=4.3.3 - - tweet-preprocessor - - pyLDAvis==3.4.1 - - umap-learn>=0.5.6 - - pydot>=3.0.1 - - TPOT>=0.12.2 - - tables>=3.10.1 - - aiohttp>=3.10.4 - - watermark - - autopep8 - - vega_datasets - - vega - - pandas-bokeh - - pythreejs - - ipywidgets - - ipyvolume - - urllib3 - - six - - ipython - - notebook - - click - - fire - - pycodestyle - - flake8 - - tdtax>=0.1.6 -# - nb_black - - pycodestyle_magic - - twine - - otter-grader>=3.1.4 - # for notebook exporting - - nbconvert[webpdf] - - nb2pdf==0.6.2 - - nbpdfexport==0.2.1 - - pytest-notebook==0.8.1 - - jupyter-tensorboard>=0.2.0 diff --git a/deployments/astro/image/infra-requirements.txt b/deployments/astro/image/infra-requirements.txt deleted file mode 100644 index 0fb0bd930..000000000 --- a/deployments/astro/image/infra-requirements.txt +++ /dev/null @@ -1,29 +0,0 @@ -# WARNING: Original source at scripts/infra-packages/requirements.txt -# PLEASE DO NOT EDIT ELSEWHERE -# After editing scripts/infra-packages/requirements.txt, please run -# scripts/infra-packages/sync.bash. - -# This file pins versions of notebook related python packages we want -# across all hubs. This makes sure we don't need to upgrade them -# everwhere one by one. - -# FIXME: Freeze this to get exact versions of all dependencies -notebook==7.0.7 -jupyterlab==4.0.11 -nbgitpuller==1.2.1 -jupyter-resource-usage==1.0.1 -# Matches version in images/hub/Dockerfile -jupyterhub==4.1.6 -appmode==0.8.0 -ipywidgets==8.0.7 -jupyter-tree-download==1.0.1 -git-credential-helpers==0.2 -# Measure popularity of different packages in our hubs -# https://discourse.jupyter.org/t/request-for-implementation-instrument-libraries-actively-used-by-users-on-a-jupyterhub/7994?u=yuvipanda -git+https://github.com/shaneknapp/python-popularity-contest.git@add-error-handling -# RISE is useful for presentations - see https://github.com/berkeley-dsep-infra/datahub/issues/2527 -RISE==5.7.1 -# syncthing for dropbox-like functionality -jupyter-syncthing-proxy==1.0.3 -# jupyter archival tool for easy user downloads -jupyter-archive==3.4.0 diff --git a/deployments/astro/image/postBuild b/deployments/astro/image/postBuild deleted file mode 100644 index b1b290345..000000000 --- a/deployments/astro/image/postBuild +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# 2024-01-13 sknapp: incompatible due to notebook 7 -# jupyter contrib nbextensions install --sys-prefix --symlink -# jupyter nbextensions_configurator enable --sys-prefix - -# installing chromium browser to enable webpdf conversion using nbconvert -export PLAYWRIGHT_BROWSERS_PATH=${CONDA_DIR} -playwright install chromium diff --git a/deployments/astro/image/start b/deployments/astro/image/start deleted file mode 100644 index c3a978b7f..000000000 --- a/deployments/astro/image/start +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -# See https://jira-secure.berkeley.edu/browse/DH-305 -export PLAYWRIGHT_BROWSERS_PATH=${CONDA_DIR} -exec "$@" diff --git a/deployments/biology/hubploy.yaml b/deployments/biology/hubploy.yaml index 26ef5b965..8ac0aeabd 100644 --- a/deployments/biology/hubploy.yaml +++ b/deployments/biology/hubploy.yaml @@ -1,10 +1,5 @@ images: - image_name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/biology-user-image - registry: - provider: gcloud - gcloud: - project: ucb-datahub-2018 - service_key: gcr-key.json + image_name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/biology-user-image:placeholder cluster: provider: gcloud diff --git a/deployments/biology/image/Dockerfile b/deployments/biology/image/Dockerfile deleted file mode 100644 index 70680021a..000000000 --- a/deployments/biology/image/Dockerfile +++ /dev/null @@ -1,206 +0,0 @@ -FROM buildpack-deps:jammy-scm - -ENV TZ=America/Los_Angeles -RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -ENV LC_ALL en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LANGUAGE en_US.UTF-8 -ENV DEBIAN_FRONTEND=noninteractive -ENV NB_USER jovyan -ENV NB_UID 1000 - -ENV CONDA_DIR /srv/conda -ENV R_LIBS_USER /srv/r - -# Explicitly add littler to PATH -# See https://github.com/conda-forge/r-littler-feedstock/issues/6 -ENV PATH ${CONDA_DIR}/lib/R/library/littler/bin:${CONDA_DIR}/bin:$PATH - -RUN adduser --disabled-password --gecos "Default Jupyter user" ${NB_USER} - -# Create user owned R libs dir -# This lets users temporarily install packages -RUN mkdir -p ${R_LIBS_USER} && chown ${NB_USER}:${NB_USER} ${R_LIBS_USER} - -# Required for PAUP* -# Note that this doesn't actually install python2, thankfully -RUN apt-get update -qq --yes > /dev/null && \ - apt-get install --yes -qq \ - libpython2.7 > /dev/null - -## library required for fast-PCA & https://github.com/DReichLab/EIG -RUN apt-get update -qq --yes && \ - apt-get install --yes --no-install-recommends -qq \ - libgsl-dev >/dev/null - -# Install these without 'recommended' packages to keep image smaller. -# Useful utils that folks sort of take for granted -RUN apt-get update -qq --yes && \ - apt-get install --yes --no-install-recommends -qq \ - less \ - htop \ - man \ - nano \ - screen \ - tar \ - tmux \ - wget \ - vim \ - tini \ - locales > /dev/null - -RUN echo "${LC_ALL} UTF-8" > /etc/locale.gen && \ - locale-gen - -# Needed by RStudio -RUN apt-get update -qq --yes && \ - apt-get install --yes --no-install-recommends -qq \ - psmisc \ - sudo \ - libapparmor1 \ - lsb-release \ - libclang-dev \ - libpq5 > /dev/null - -# Needed by many R libraries -# Picked up from https://github.com/rocker-org/rocker/blob/9dc3e458d4e92a8f41ccd75687cd7e316e657cc0/r-rspm/focal/Dockerfile -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - libgdal-dev \ - libgeos3.10.2 \ - libproj22 \ - libudunits2-0 \ - libxml2 > /dev/null - -# Install R. -# These packages must be installed into the base stage since they are in system -# paths rather than /srv. -# Pre-built R packages from rspm are built against system libs in jammy. -ENV R_VERSION=4.4.1-1.2204.0 -ENV LITTLER_VERSION=0.3.19-1.2204.0 -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/" > /etc/apt/sources.list.d/cran.list -RUN curl --silent --location --fail https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc > /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc -RUN apt-get update -qq --yes > /dev/null && \ - apt-get install --yes -qq \ - r-base-core=${R_VERSION} \ - r-base-dev=${R_VERSION} \ - littler=${LITTLER_VERSION} \ - libglpk-dev \ - libzmq5 \ - nodejs npm > /dev/null - -ENV RSTUDIO_URL=https://download2.rstudio.org/server/jammy/amd64/rstudio-server-2024.04.2-764-amd64.deb -RUN curl --silent --location --fail ${RSTUDIO_URL} > /tmp/rstudio.deb && \ - apt install --no-install-recommends --yes /tmp/rstudio.deb && \ - rm /tmp/rstudio.deb - -# Install desktop packages -RUN apt-get update -qq --yes > /dev/null && \ - apt-get install --yes -qq \ - dbus-x11 \ - firefox \ - xfce4 \ - xfce4-panel \ - xfce4-terminal \ - xfce4-session \ - xfce4-settings \ - xorg \ - xubuntu-icon-theme > /dev/null - -# for nbconvert & notebook-to-pdf -RUN apt-get update -qq --yes && \ - apt-get install --yes -qq \ - pandoc \ - texlive-xetex \ - texlive-fonts-recommended \ - libx11-xcb1 \ - libxtst6 \ - libxrandr2 \ - libasound2 \ - libpangocairo-1.0-0 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libgtk-3-0 \ - libnss3 \ - libxss1 \ - > /dev/null - -# Adding ncompress,pbzip2 for issue #1885 BioE-131, Fall 2020 -RUN apt-get update -qq --yes > /dev/null && \ - apt-get install --yes -qq \ - ncompress \ - pbzip2 > /dev/null - -WORKDIR /home/jovyan - -# R_LIBS_USER is set by default in /etc/R/Renviron, which RStudio loads. -# We uncomment the default, and set what we wanna - so it picks up -# the packages we install. Without this, RStudio doesn't see the packages -# that R does. -# Stolen from https://github.com/jupyterhub/repo2docker/blob/6a07a48b2df48168685bb0f993d2a12bd86e23bf/repo2docker/buildpacks/r.py -RUN sed -i -e '/^R_LIBS_USER=/s/^/#/' /etc/R/Renviron && \ - echo "R_LIBS_USER=${R_LIBS_USER}" >> /etc/R/Renviron - -# Needed by Rhtslib -RUN apt-get update -qq --yes && \ - apt-get install --yes -qq \ - libcurl4-openssl-dev > /dev/null - -COPY install-mambaforge.bash /tmp/install-mambaforge.bash -RUN chmod 777 /tmp/install-mambaforge.bash -RUN /tmp/install-mambaforge.bash - -USER ${NB_USER} - -COPY environment.yml /tmp/ -COPY infra-requirements.txt /tmp/ - -RUN mamba env update -p ${CONDA_DIR} -f /tmp/environment.yml && \ - mamba clean -afy - -USER root -ENV PLAYWRIGHT_BROWSERS_PATH ${CONDA_DIR} -RUN playwright install-deps -RUN chown -Rh jovyan:jovyan /srv/conda - -USER ${NB_USER} - -# DH-333 -ENV PLAYWRIGHT_BROWSERS_PATH ${CONDA_DIR} -RUN playwright install chromium - -# 2024-01-13 sknapp: incompatible due to notebook 7 -# RUN jupyter contrib nbextensions install --sys-prefix --symlink && \ -# jupyter nbextensions_configurator enable --sys-prefix - -# Set CRAN mirror to rspm before we install anything -COPY Rprofile.site /usr/lib/R/etc/Rprofile.site -# RStudio needs its own config -COPY rsession.conf /etc/rstudio/rsession.conf -# Use simpler locking strategy -COPY file-locks /etc/rstudio/file-locks - -# Install IRKernel -RUN r -e "install.packages('IRkernel', version='1.2')" && \ - r -e "IRkernel::installspec(prefix='${CONDA_DIR}')" - -# Install R packages, cleanup temp package download location -COPY install.R /tmp/install.R -RUN r /tmp/install.R && \ - rm -rf /tmp/downloaded_packages/ /tmp/*.rds - -# install bio1b packages -COPY bio1b-packages.bash /tmp/bio1b-packages.bash -RUN bash /tmp/bio1b-packages.bash - -# install ib134L packages -COPY ib134-packages.bash /tmp/ib134-packages.bash -RUN bash /tmp/ib134-packages.bash - -# install ccb293 packages -COPY ccb293-packages.bash /tmp/ccb293-packages.bash -RUN bash /tmp/ccb293-packages.bash - -ENTRYPOINT ["tini", "--"] diff --git a/deployments/biology/image/README.md b/deployments/biology/image/README.md new file mode 100644 index 000000000..4588d74ef --- /dev/null +++ b/deployments/biology/image/README.md @@ -0,0 +1,5 @@ +# Biology Image + +This image is now located [in its own repo](https://github.com/berkeley-dsep-infra/biology-user-image). + +Please see [the contribution guide](https://github.com/berkeley-dsep-infra/biology-user-image/blob/main/CONTRIBUTING.md) for instructions on how to propose changes to the image. diff --git a/deployments/biology/image/Rprofile.site b/deployments/biology/image/Rprofile.site deleted file mode 100644 index 961f50b97..000000000 --- a/deployments/biology/image/Rprofile.site +++ /dev/null @@ -1,10 +0,0 @@ -# Use RStudio's CRAN mirror to get binary packages -# 'latest' just means it has all available versions. -# We can specify version numbers in devtools::install_version -options(repos = c(CRAN = "https://packagemanager.rstudio.com/all/__linux__/jammy/latest")) - -# RStudio's CRAN mirror needs this to figure out which binary package to serve. -# If not set properly, it will just serve up source packages -# Quite hilarious, IMO. -# See https://docs.rstudio.com/rspm/admin/binaries.html -options(HTTPUserAgent = sprintf("R/%s R (%s)", getRversion(), paste(getRversion(), R.version$platform, R.version$arch, R.version$os))) diff --git a/deployments/biology/image/bio1b-packages.bash b/deployments/biology/image/bio1b-packages.bash deleted file mode 100644 index d926eaddb..000000000 --- a/deployments/biology/image/bio1b-packages.bash +++ /dev/null @@ -1,11 +0,0 @@ -# Install PAUP* for BIO 1B -# https://github.com/berkeley-dsep-infra/datahub/issues/1699 - -# This package was requested in 2020 for the instructor to try out. -# The 168 version doesn't exist so I've bumped it to 169, but also disabled -# it in case the package is no longer needed. -return - -wget https://phylosolutions.com/paup-test/paup4a169_ubuntu64.gz -O ${CONDA_DIR}/bin/paup.gz -gunzip ${CONDA_DIR}/bin/paup.gz -chmod +x ${CONDA_DIR}/bin/paup diff --git a/deployments/biology/image/ccb293-packages.bash b/deployments/biology/image/ccb293-packages.bash deleted file mode 100644 index 8da8e511d..000000000 --- a/deployments/biology/image/ccb293-packages.bash +++ /dev/null @@ -1,5 +0,0 @@ -# Install QIIME2 for CCB293 -# https://github.com/berkeley-dsep-infra/datahub/issues/1699 -wget https://data.qiime2.org/distro/core/qiime2-2021.8-py38-linux-conda.yml -mamba env create -n qiime2 --file qiime2-2021.8-py38-linux-conda.yml && mamba clean -afy -rm qiime2-2021.8-py38-linux-conda.yml diff --git a/deployments/biology/image/environment.yml b/deployments/biology/image/environment.yml deleted file mode 100644 index cc4bee30c..000000000 --- a/deployments/biology/image/environment.yml +++ /dev/null @@ -1,82 +0,0 @@ -channels: -- bioconda -- conda-forge -dependencies: -- python=3.11.* -- pip=22.2.* - -# Package to allow Jupyter Notebook or JupyterLab applications in one conda env to access other kernels (e.g. qiime2) -- nb_conda_kernels=2.3.1 - -# proxy web applications -- jupyter-server-proxy==4.2.0 -- jupyter-rsession-proxy==2.0.1 - -# Packages from bioconda for IB134L -# - bwa=0.7.12 -- samtools=1.3.1 -- mafft=7.471 -- emboss=6.6.0 -- augustus=3.5.0 -- raxml-ng=1.0.1 -- structure=2.3.4 -- paml=4.9 -#- repeatmasker=4.0.9 -- trimmomatic=0.39 -- blast=2.12.0 -- fastqc=0.11.9 -- phyml=3.3.20200621 -- sra-tools=2.11 -# - hisat2=2.2.1 -# - subread=2.0.1 -- plink=1.90b6.21 - -- syncthing==1.18.6 - -# Packages for IB120/201/CCB210 -- sympy=1.12 - -# Packages from bioconda for BioE-131, Fall 2020, Issue #1885 -# - bowtie2=2.5.3 # commented out by sknapp 2024.05.07 doesn't support py3.11 -- spades=3.14.1 - -# Packages for MCB280A, Spring 2022 -- bedtools=2.30.0 - -# Packages from bioconda for BIO1B -- raxml=8.2.* -- muscle=3.8.* -- dendropy=4.4.* - -# pedagogy packages -- scipy=1.11. -- pandas=2.2.2 -- seaborn=0.11.2 - -# compbio BioE c146, Fall 22 and into the future, issue 3785 -- scikit-learn=1.2.2 - -# bug w/notebook and traitlets: https://github.com/jupyter/notebook/issues/7048 -- traitlets=5.9.* - -# for MCELLBI201B (FA24) https://github.com/berkeley-dsep-infra/datahub/issues/5988 -- macs2==2.2.9.1 - -# For https://github.com/berkeley-dsep-infra/datahub/issues/1846 -# Conda does not have these -- pip: - - -r infra-requirements.txt -# MCB280A, Spring 2022 - - pybedtools==0.10.0 -# removed for PR https://github.com/berkeley-dsep-infra/datahub/pull/4406 - macs2==2.2.7.1 - - geonomics==1.4.1 - - nlmpy==1.2.0 - - datascience==0.17.6 - -# Packages for MCB-160L iss #3942 - - allensdk==2.16.2 - - otter-grader==3.1.4 -# for exporting notebooks to pdf - - nbconvert[webpdf]==7.16.4 - - nb2pdf==0.6.2 - - nbpdfexport==0.2.1 diff --git a/deployments/biology/image/file-locks b/deployments/biology/image/file-locks deleted file mode 100644 index 7b1a3fcf4..000000000 --- a/deployments/biology/image/file-locks +++ /dev/null @@ -1,13 +0,0 @@ -# https://docs.rstudio.com/ide/server-pro/load_balancing/configuration.html#file-locking - -# rocker sets this to advisory, but this might be causing NFS issues. -# lets set it to the default (default: linkbased) -lock-type=linkbased - -# we'll also reduce the frequency by 1/3 -refresh-rate=60 -timeout-interval=90 - -# log attempts -# enable-logging=1 -# log-file=/tmp/rstudio-locking.log diff --git a/deployments/biology/image/ib134-packages.bash b/deployments/biology/image/ib134-packages.bash deleted file mode 100644 index be81fa2ce..000000000 --- a/deployments/biology/image/ib134-packages.bash +++ /dev/null @@ -1,32 +0,0 @@ -############################ -# Install packages for IB134L -############################ - -#LOCAL_BIN=${REPO_DIR}/.local/bin -#mkdir -p ${LOCAL_BIN} -# -## mitoZ installation -# -#wget https://raw.githubusercontent.com/linzhi2013/MitoZ/master/version_2.4-alpha/release_MitoZ_v2.4-alpha.tar.bz2 -O ${REPO_DIR}/release_MitoZ_v2.4-alpha.tar.bz2 -#pushd ${REPO_DIR} -#tar -jxvf release_MitoZ_v2.4-alpha.tar.bz2 -#rm release_MitoZ_v2.4-alpha.tar.bz2 -#cd release_MitoZ_v2.4-alpha -#wget https://raw.githubusercontent.com/linzhi2013/MitoZ/master/version_2.4-alpha/mitozEnv.yaml -#cd .. -# -### create mitoZ env -#conda env create -n mitozEnv -f release_MitoZ_v2.4-alpha/mitozEnv.yaml # worked after reinstallation of conda -# -### patch ncbiquery.py -#cp patches/ncbiquery.py /srv/conda/envs/mitozEnv/lib/python3.6/site-packages/ete3/ncbi_taxonomy/ncbiquery.py -# -### download annotations -##source activate mitozEnv -##python3 mitozEnv_config.py -##source deactivate - - -### - - diff --git a/deployments/biology/image/infra-requirements.txt b/deployments/biology/image/infra-requirements.txt deleted file mode 100644 index 0fb0bd930..000000000 --- a/deployments/biology/image/infra-requirements.txt +++ /dev/null @@ -1,29 +0,0 @@ -# WARNING: Original source at scripts/infra-packages/requirements.txt -# PLEASE DO NOT EDIT ELSEWHERE -# After editing scripts/infra-packages/requirements.txt, please run -# scripts/infra-packages/sync.bash. - -# This file pins versions of notebook related python packages we want -# across all hubs. This makes sure we don't need to upgrade them -# everwhere one by one. - -# FIXME: Freeze this to get exact versions of all dependencies -notebook==7.0.7 -jupyterlab==4.0.11 -nbgitpuller==1.2.1 -jupyter-resource-usage==1.0.1 -# Matches version in images/hub/Dockerfile -jupyterhub==4.1.6 -appmode==0.8.0 -ipywidgets==8.0.7 -jupyter-tree-download==1.0.1 -git-credential-helpers==0.2 -# Measure popularity of different packages in our hubs -# https://discourse.jupyter.org/t/request-for-implementation-instrument-libraries-actively-used-by-users-on-a-jupyterhub/7994?u=yuvipanda -git+https://github.com/shaneknapp/python-popularity-contest.git@add-error-handling -# RISE is useful for presentations - see https://github.com/berkeley-dsep-infra/datahub/issues/2527 -RISE==5.7.1 -# syncthing for dropbox-like functionality -jupyter-syncthing-proxy==1.0.3 -# jupyter archival tool for easy user downloads -jupyter-archive==3.4.0 diff --git a/deployments/biology/image/install-mambaforge.bash b/deployments/biology/image/install-mambaforge.bash deleted file mode 100644 index 47bf53f1d..000000000 --- a/deployments/biology/image/install-mambaforge.bash +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# This downloads and installs a pinned version of mambaforge -set -ex - -cd $(dirname $0) -MAMBAFORGE_VERSION='24.3.0-0' - -URL="https://github.com/conda-forge/miniforge/releases/download/${MAMBAFORGE_VERSION}/Mambaforge-${MAMBAFORGE_VERSION}-Linux-x86_64.sh" -INSTALLER_PATH=/tmp/mambaforge-installer.sh - -wget --quiet $URL -O ${INSTALLER_PATH} -chmod +x ${INSTALLER_PATH} - -bash ${INSTALLER_PATH} -b -p ${CONDA_DIR} -export PATH="${CONDA_DIR}/bin:$PATH" - -# Do not attempt to auto update conda or dependencies -conda config --system --set auto_update_conda false -conda config --system --set show_channel_urls true - -# empty conda history file, -# which seems to result in some effective pinning of packages in the initial env, -# which we don't intend. -# this file must not be *removed*, however -echo '' > ${CONDA_DIR}/conda-meta/history - -# Clean things out! -conda clean --all -f -y - -# Remove the big installer so we don't increase docker image size too much -rm ${INSTALLER_PATH} - -# Remove the pip cache created as part of installing mambaforge -rm -rf /root/.cache - -chown -R $NB_USER:$NB_USER ${CONDA_DIR} - -conda list -n root diff --git a/deployments/biology/image/install.R b/deployments/biology/image/install.R deleted file mode 100644 index 8e7e0cb82..000000000 --- a/deployments/biology/image/install.R +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env r - -# Install devtools, so we can install versioned packages -install.packages("devtools") - -# Install a bunch of R packages -# This doesn't do any dependency resolution or anything, -# so refer to `installed.packages()` for authoritative list -cran_packages <- c( - "tidyverse", "1.3.0", - "adegenet", "2.1.3", - "pegas", "0.14", - "phytools", "0.7-70", - "ape","5.4-1", - "seqinr","4.2-4", - "hierfstat","0.5-7", - "poppr","2.8.6", - "PopGenome","2.7.5", - "detectRUNS","0.9.6", - "pwr","1.3" , - "plotly","4.9.3", - "mixtools","1.2.0", - "mclust","5.4.7", - "pheatmap","1.0.12", - "phangorn","2.7.1", - "qqman","0.1.8" -) - -for (i in seq(1, length(cran_packages), 2)) { - devtools::install_version( - cran_packages[i], - version = cran_packages[i + 1] - ) -} - -## Bioconductor packages -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - -BiocManager::install("EBSeq") -BiocManager::install("Rhtslib") -BiocManager::install("dada2") -BiocManager::install("phyloseq") -BiocManager::install("Biostrings") -BiocManager::install("cummeRbund") -BiocManager::install("DESeq2") -BiocManager::install("apeglm") -BiocManager::install('EnhancedVolcano') diff --git a/deployments/biology/image/mitozEnv_config.py b/deployments/biology/image/mitozEnv_config.py deleted file mode 100644 index a1c4da0a6..000000000 --- a/deployments/biology/image/mitozEnv_config.py +++ /dev/null @@ -1,3 +0,0 @@ -from ete3 import NCBITaxa -ncbi = NCBITaxa() -ncbi.update_taxonomy_database() diff --git a/deployments/biology/image/patches/ncbiquery.py b/deployments/biology/image/patches/ncbiquery.py deleted file mode 100644 index 5a2192d04..000000000 --- a/deployments/biology/image/patches/ncbiquery.py +++ /dev/null @@ -1,831 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# #START_LICENSE########################################################### -# -# -# This file is part of the Environment for Tree Exploration program -# (ETE). http://etetoolkit.org -# -# ETE is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# ETE is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public -# License for more details. -# -# You should have received a copy of the GNU General Public License -# along with ETE. If not, see . -# -# -# ABOUT THE ETE PACKAGE -# ===================== -# -# ETE is distributed under the GPL copyleft license (2008-2015). -# -# If you make use of ETE in published work, please cite: -# -# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. -# ETE: a python Environment for Tree Exploration. Jaime BMC -# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 -# -# Note that extra references to the specific methods implemented in -# the toolkit may be available in the documentation. -# -# More info at http://etetoolkit.org. Contact: huerta@embl.de -# -# -# #END_LICENSE############################################################# - - -from __future__ import absolute_import -from __future__ import print_function -import sys -import os -try: - import cPickle as pickle -except ImportError: - # python 3 support - import pickle - -from collections import defaultdict, Counter - -import sqlite3 -import math -import tarfile -import six -from six.moves import map -import warnings - - -__all__ = ["NCBITaxa", "is_taxadb_up_to_date"] - -DB_VERSION = 2 -DEFAULT_TAXADB = os.path.join(os.environ.get('HOME', '/'), '.etetoolkit', 'taxa.sqlite') - - -def is_taxadb_up_to_date(dbfile=DEFAULT_TAXADB): - """Check if a valid and up-to-date taxa.sqlite database exists - - If dbfile= is not specified, DEFAULT_TAXADB is assumed - """ - db = sqlite3.connect(dbfile) - - try: - r = db.execute('SELECT version FROM stats;') - version = r.fetchone()[0] - except (sqlite3.OperationalError, ValueError, IndexError, TypeError): - version = None - - db.close() - - if version != DB_VERSION: - return False - - return True - - -class NCBITaxa(object): - """ - versionadded: 2.3 - - Provides a local transparent connector to the NCBI taxonomy database. - """ - - def __init__(self, dbfile=None, taxdump_file=None): - - if not dbfile: - self.dbfile = DEFAULT_TAXADB - else: - self.dbfile = dbfile - - if taxdump_file: - self.update_taxonomy_database(taxdump_file) - - if dbfile is None and not os.path.exists(self.dbfile): - print('NCBI database not present yet (first time used?)', file=sys.stderr) - self.update_taxonomy_database(taxdump_file) - - if not os.path.exists(self.dbfile): - raise ValueError("Cannot open taxonomy database: %s" % self.dbfile) - - self.db = None - self._connect() - - if not is_taxadb_up_to_date(self.dbfile): - print('NCBI database format is outdated. Upgrading', file=sys.stderr) - self.update_taxonomy_database(taxdump_file) - - def update_taxonomy_database(self, taxdump_file=None): - """Updates the ncbi taxonomy database by downloading and parsing the latest - taxdump.tar.gz file from the NCBI FTP site (via HTTP). - - :param None taxdump_file: an alternative location of the taxdump.tax.gz file. - """ - if not taxdump_file: - update_db(self.dbfile) - else: - update_db(self.dbfile, taxdump_file) - - def _connect(self): - self.db = sqlite3.connect(self.dbfile) - - def _translate_merged(self, all_taxids): - conv_all_taxids = set((list(map(int, all_taxids)))) - cmd = 'select taxid_old, taxid_new FROM merged WHERE taxid_old IN (%s)' %','.join(map(str, all_taxids)) - - result = self.db.execute(cmd) - conversion = {} - for old, new in result.fetchall(): - conv_all_taxids.discard(int(old)) - conv_all_taxids.add(int(new)) - conversion[int(old)] = int(new) - return conv_all_taxids, conversion - - - def get_fuzzy_name_translation(self, name, sim=0.9): - ''' - Given an inexact species name, returns the best match in the NCBI database of taxa names. - - :argument 0.9 sim: Min word similarity to report a match (from 0 to 1). - - :return: taxid, species-name-match, match-score - ''' - - - import sqlite3.dbapi2 as dbapi2 - _db = dbapi2.connect(self.dbfile) - _db.enable_load_extension(True) - module_path = os.path.split(os.path.realpath(__file__))[0] - _db.execute("select load_extension('%s')" % os.path.join(module_path, - "SQLite-Levenshtein/levenshtein.sqlext")) - - print("Trying fuzzy search for %s" % name) - maxdiffs = math.ceil(len(name) * (1-sim)) - cmd = 'SELECT taxid, spname, LEVENSHTEIN(spname, "%s") AS sim FROM species WHERE sim<=%s ORDER BY sim LIMIT 1;' % (name, maxdiffs) - taxid, spname, score = None, None, len(name) - result = _db.execute(cmd) - try: - taxid, spname, score = result.fetchone() - except TypeError: - cmd = 'SELECT taxid, spname, LEVENSHTEIN(spname, "%s") AS sim FROM synonym WHERE sim<=%s ORDER BY sim LIMIT 1;' % (name, maxdiffs) - result = _db.execute(cmd) - try: - taxid, spname, score = result.fetchone() - except Exception: - pass - else: - taxid = int(taxid) - else: - taxid = int(taxid) - - norm_score = 1 - (float(score)/len(name)) - if taxid: - print("FOUND! %s taxid:%s score:%s (%s)" %(spname, taxid, score, norm_score)) - - return taxid, spname, norm_score - - def get_rank(self, taxids): - 'return a dictionary converting a list of taxids into their corresponding NCBI taxonomy rank' - - all_ids = set(taxids) - all_ids.discard(None) - all_ids.discard("") - query = ','.join(['"%s"' %v for v in all_ids]) - cmd = "select taxid, rank FROM species WHERE taxid IN (%s);" %query - result = self.db.execute(cmd) - id2rank = {} - for tax, spname in result.fetchall(): - id2rank[tax] = spname - return id2rank - - def get_lineage_translator(self, taxids): - """Given a valid taxid number, return its corresponding lineage track as a - hierarchically sorted list of parent taxids. - """ - all_ids = set(taxids) - all_ids.discard(None) - all_ids.discard("") - query = ','.join(['"%s"' %v for v in all_ids]) - result = self.db.execute('SELECT taxid, track FROM species WHERE taxid IN (%s);' %query) - id2lineages = {} - for tax, track in result.fetchall(): - id2lineages[tax] = list(map(int, reversed(track.split(",")))) - - return id2lineages - - - def get_lineage(self, taxid): - """Given a valid taxid number, return its corresponding lineage track as a - hierarchically sorted list of parent taxids. - """ - if not taxid: - return None - result = self.db.execute('SELECT track FROM species WHERE taxid=%s' %taxid) - raw_track = result.fetchone() - if not raw_track: - #perhaps is an obsolete taxid - _, merged_conversion = self._translate_merged([taxid]) - if taxid in merged_conversion: - result = self.db.execute('SELECT track FROM species WHERE taxid=%s' %merged_conversion[taxid]) - raw_track = result.fetchone() - # if not raise error - if not raw_track: - #raw_track = ["1"] - raise ValueError("%s taxid not found" %taxid) - else: - warnings.warn("taxid %s was translated into %s" %(taxid, merged_conversion[taxid])) - - track = list(map(int, raw_track[0].split(","))) - return list(reversed(track)) - - def get_common_names(self, taxids): - query = ','.join(['"%s"' %v for v in taxids]) - cmd = "select taxid, common FROM species WHERE taxid IN (%s);" %query - result = self.db.execute(cmd) - id2name = {} - for tax, common_name in result.fetchall(): - if common_name: - id2name[tax] = common_name - return id2name - - def get_taxid_translator(self, taxids, try_synonyms=True): - """Given a list of taxids, returns a dictionary with their corresponding - scientific names. - """ - - all_ids = set(map(int, taxids)) - all_ids.discard(None) - all_ids.discard("") - query = ','.join(['"%s"' %v for v in all_ids]) - cmd = "select taxid, spname FROM species WHERE taxid IN (%s);" %query - result = self.db.execute(cmd) - id2name = {} - for tax, spname in result.fetchall(): - id2name[tax] = spname - - # any taxid without translation? lets tray in the merged table - if len(all_ids) != len(id2name) and try_synonyms: - not_found_taxids = all_ids - set(id2name.keys()) - taxids, old2new = self._translate_merged(not_found_taxids) - new2old = {v: k for k,v in six.iteritems(old2new)} - - if old2new: - query = ','.join(['"%s"' %v for v in new2old]) - cmd = "select taxid, spname FROM species WHERE taxid IN (%s);" %query - result = self.db.execute(cmd) - for tax, spname in result.fetchall(): - id2name[new2old[tax]] = spname - - return id2name - - def get_name_translator(self, names): - """ - Given a list of taxid scientific names, returns a dictionary translating them into their corresponding taxids. - - Exact name match is required for translation. - """ - - name2id = {} - #name2realname = {} - name2origname = {} - for n in names: - name2origname[n.lower()] = n - - names = set(name2origname.keys()) - - query = ','.join(['"%s"' %n for n in six.iterkeys(name2origname)]) - cmd = 'select spname, taxid from species where spname IN (%s)' %query - result = self.db.execute(cmd) - for sp, taxid in result.fetchall(): - oname = name2origname[sp.lower()] - name2id.setdefault(oname, []).append(taxid) - #name2realname[oname] = sp - missing = names - set([n.lower() for n in name2id.keys()]) - if missing: - query = ','.join(['"%s"' %n for n in missing]) - result = self.db.execute('select spname, taxid from synonym where spname IN (%s)' %query) - for sp, taxid in result.fetchall(): - oname = name2origname[sp.lower()] - name2id.setdefault(oname, []).append(taxid) - #name2realname[oname] = sp - return name2id - - def translate_to_names(self, taxids): - """ - Given a list of taxid numbers, returns another list with their corresponding scientific names. - """ - id2name = self.get_taxid_translator(taxids) - names = [] - for sp in taxids: - names.append(id2name.get(sp, sp)) - return names - - - def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None, collapse_subspecies=False, return_tree=False): - """ - given a parent taxid or scientific species name, returns a list of all its descendants taxids. - If intermediate_nodes is set to True, internal nodes will also be dumped. - - """ - try: - taxid = int(parent) - except ValueError: - try: - taxid = self.get_name_translator([parent])[parent][0] - except KeyError: - raise ValueError('%s not found!' %parent) - - with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE: - prepostorder = pickle.load(CACHED_TRAVERSE) - descendants = {} - found = 0 - for tid in prepostorder: - if tid == taxid: - found += 1 - elif found == 1: - descendants[tid] = descendants.get(tid, 0) + 1 - elif found == 2: - break - - if not found: - raise ValueError("taxid not found:%s" %taxid) - elif found == 1: - return [taxid] - - if rank_limit or collapse_subspecies or return_tree: - tree = self.get_topology(list(descendants.keys()), intermediate_nodes=intermediate_nodes, collapse_subspecies=collapse_subspecies, rank_limit=rank_limit) - if return_tree: - return tree - elif intermediate_nodes: - return list(map(int, [n.name for n in tree.get_descendants()])) - else: - return map(int, [n.name for n in tree]) - - elif intermediate_nodes: - return [tid for tid, count in six.iteritems(descendants)] - else: - return [tid for tid, count in six.iteritems(descendants) if count == 1] - - def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, collapse_subspecies=False, annotate=True): - """Given a list of taxid numbers, return the minimal pruned NCBI taxonomy tree - containing all of them. - - :param False intermediate_nodes: If True, single child nodes - representing the complete lineage of leaf nodes are kept. - Otherwise, the tree is pruned to contain the first common - ancestor of each group. - - :param None rank_limit: If valid NCBI rank name is provided, - the tree is pruned at that given level. For instance, use - rank="species" to get rid of sub-species or strain leaf - nodes. - - :param False collapse_subspecies: If True, any item under the - species rank will be collapsed into the species upper - node. - - """ - from .. import PhyloTree - taxids, merged_conversion = self._translate_merged(taxids) - if len(taxids) == 1: - root_taxid = int(list(taxids)[0]) - with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE: - prepostorder = pickle.load(CACHED_TRAVERSE) - nodes = {} - visited = set() - start = prepostorder.index(root_taxid) - try: - end = prepostorder.index(root_taxid, start+1) - subtree = prepostorder[start:end+1] - except ValueError: - # If root taxid is not found in postorder, must be a tip node - subtree = [root_taxid] - - leaves = set([v for v, count in Counter(subtree).items() if count == 1]) - nodes[root_taxid] = PhyloTree(name=str(root_taxid)) - current_parent = nodes[root_taxid] - for tid in subtree: - if tid in visited: - current_parent = nodes[tid].up - else: - visited.add(tid) - nodes[tid] = PhyloTree(name=str(tid)) - current_parent.add_child(nodes[tid]) - if tid not in leaves: - current_parent = nodes[tid] - root = nodes[root_taxid] - else: - taxids = set(map(int, taxids)) - sp2track = {} - elem2node = {} - id2lineage = self.get_lineage_translator(taxids) - all_taxids = set() - for lineage in id2lineage.values(): - all_taxids.update(lineage) - id2rank = self.get_rank(all_taxids) - for sp in taxids: - track = [] - lineage = id2lineage[sp] - - for elem in lineage: - if elem not in elem2node: - node = elem2node.setdefault(elem, PhyloTree()) - node.name = str(elem) - node.taxid = elem - node.add_feature("rank", str(id2rank.get(int(elem), "no rank"))) - else: - node = elem2node[elem] - track.append(node) - sp2track[sp] = track - # generate parent child relationships - for sp, track in six.iteritems(sp2track): - parent = None - for elem in track: - if parent and elem not in parent.children: - parent.add_child(elem) - if rank_limit and elem.rank == rank_limit: - break - parent = elem - root = elem2node[1] - - #remove onechild-nodes - if not intermediate_nodes: - for n in root.get_descendants(): - if len(n.children) == 1 and int(n.name) not in taxids: - n.delete(prevent_nondicotomic=False) - - if len(root.children) == 1: - tree = root.children[0].detach() - else: - tree = root - - if collapse_subspecies: - to_detach = [] - for node in tree.traverse(): - if node.rank == "species": - to_detach.extend(node.children) - for n in to_detach: - n.detach() - - if annotate: - self.annotate_tree(tree) - - return tree - - - def annotate_tree(self, t, taxid_attr="name", tax2name=None, tax2track=None, tax2rank=None): - """Annotate a tree containing taxids as leaf names by adding the 'taxid', - 'sci_name', 'lineage', 'named_lineage' and 'rank' additional attributes. - - :param t: a Tree (or Tree derived) instance. - - :param name taxid_attr: Allows to set a custom node attribute - containing the taxid number associated to each node (i.e. - species in PhyloTree instances). - - :param tax2name,tax2track,tax2rank: Use these arguments to - provide pre-calculated dictionaries providing translation - from taxid number and names,track lineages and ranks. - """ - - taxids = set() - for n in t.traverse(): - try: - tid = int(getattr(n, taxid_attr)) - except (ValueError,AttributeError): - pass - else: - taxids.add(tid) - merged_conversion = {} - - taxids, merged_conversion = self._translate_merged(taxids) - - if not tax2name or taxids - set(map(int, list(tax2name.keys()))): - tax2name = self.get_taxid_translator(taxids) - if not tax2track or taxids - set(map(int, list(tax2track.keys()))): - tax2track = self.get_lineage_translator(taxids) - - all_taxid_codes = set([_tax for _lin in list(tax2track.values()) for _tax in _lin]) - extra_tax2name = self.get_taxid_translator(list(all_taxid_codes - set(tax2name.keys()))) - tax2name.update(extra_tax2name) - - tax2common_name = self.get_common_names(tax2name.keys()) - - if not tax2rank: - tax2rank = self.get_rank(list(tax2name.keys())) - - n2leaves = t.get_cached_content() - - for n in t.traverse('postorder'): - try: - node_taxid = int(getattr(n, taxid_attr)) - except (ValueError, AttributeError): - node_taxid = None - - n.add_features(taxid = node_taxid) - if node_taxid: - if node_taxid in merged_conversion: - node_taxid = merged_conversion[node_taxid] - n.add_features(sci_name = tax2name.get(node_taxid, getattr(n, taxid_attr, '')), - common_name = tax2common_name.get(node_taxid, ''), - lineage = tax2track[node_taxid], - rank = tax2rank.get(node_taxid, 'Unknown'), - named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track[node_taxid]]) - elif n.is_leaf(): - n.add_features(sci_name = getattr(n, taxid_attr, 'NA'), - common_name = '', - lineage = [], - rank = 'Unknown', - named_lineage = []) - else: - lineage = self._common_lineage([lf.lineage for lf in n2leaves[n]]) - ancestor = lineage[-1] - n.add_features(sci_name = tax2name.get(ancestor, str(ancestor)), - common_name = tax2common_name.get(ancestor, ''), - taxid = ancestor, - lineage = lineage, - rank = tax2rank.get(ancestor, 'Unknown'), - named_lineage = [tax2name.get(tax, str(tax)) for tax in lineage]) - - return tax2name, tax2track, tax2rank - - def _common_lineage(self, vectors): - occurrence = defaultdict(int) - pos = defaultdict(set) - for v in vectors: - for i, taxid in enumerate(v): - occurrence[taxid] += 1 - pos[taxid].add(i) - - common = [taxid for taxid, ocu in six.iteritems(occurrence) if ocu == len(vectors)] - if not common: - return [""] - else: - sorted_lineage = sorted(common, key=lambda x: min(pos[x])) - return sorted_lineage - - # OLD APPROACH: - - # visited = defaultdict(int) - # for index, name in [(ei, e) for v in vectors for ei, e in enumerate(v)]: - # visited[(name, index)] += 1 - - # def _sort(a, b): - # if a[1] > b[1]: - # return 1 - # elif a[1] < b[1]: - # return -1 - # else: - # if a[0][1] > b[0][1]: - # return 1 - # elif a[0][1] < b[0][1]: - # return -1 - # return 0 - - # matches = sorted(visited.items(), _sort) - - # if matches: - # best_match = matches[-1] - # else: - # return "", set() - - # if best_match[1] != len(vectors): - # return "", set() - # else: - # return best_match[0][0], [m[0][0] for m in matches if m[1] == len(vectors)] - - - def get_broken_branches(self, t, taxa_lineages, n2content=None): - """Returns a list of NCBI lineage names that are not monophyletic in the - provided tree, as well as the list of affected branches and their size. - - CURRENTLY EXPERIMENTAL - - """ - if not n2content: - n2content = t.get_cached_content() - - tax2node = defaultdict(set) - - unknown = set() - for leaf in t.iter_leaves(): - if leaf.sci_name.lower() != "unknown": - lineage = taxa_lineages[leaf.taxid] - for index, tax in enumerate(lineage): - tax2node[tax].add(leaf) - else: - unknown.add(leaf) - - broken_branches = defaultdict(set) - broken_clades = set() - for tax, leaves in six.iteritems(tax2node): - if len(leaves) > 1: - common = t.get_common_ancestor(leaves) - else: - common = list(leaves)[0] - if (leaves ^ set(n2content[common])) - unknown: - broken_branches[common].add(tax) - broken_clades.add(tax) - - broken_clade_sizes = [len(tax2node[tax]) for tax in broken_clades] - return broken_branches, broken_clades, broken_clade_sizes - - - # def annotate_tree_with_taxa(self, t, name2taxa_file, tax2name=None, tax2track=None, attr_name="name"): - # if name2taxa_file: - # names2taxid = dict([map(strip, line.split("\t")) - # for line in open(name2taxa_file)]) - # else: - # names2taxid = dict([(n.name, getattr(n, attr_name)) for n in t.iter_leaves()]) - - # not_found = 0 - # for n in t.iter_leaves(): - # n.add_features(taxid=names2taxid.get(n.name, 0)) - # n.add_features(species=n.taxid) - # if n.taxid == 0: - # not_found += 1 - # if not_found: - # print >>sys.stderr, "WARNING: %s nodes where not found within NCBI taxonomy!!" %not_found - - # return self.annotate_tree(t, tax2name, tax2track, attr_name="taxid") - - -def load_ncbi_tree_from_dump(tar): - from .. import Tree - # Download: http://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz - parent2child = {} - name2node = {} - node2taxname = {} - synonyms = set() - node2common = {} - print("Loading node names...") - for line in tar.extractfile("names.dmp"): - line = str(line.decode()) - fields = [_f.strip() for _f in line.split("|")] - nodename = fields[0] - name_type = fields[3].lower() - taxname = fields[1] - if name_type == "scientific name": - node2taxname[nodename] = taxname - if name_type == "genbank common name": - node2common[nodename] = taxname - elif name_type in set(["synonym", "equivalent name", "genbank equivalent name", - "anamorph", "genbank synonym", "genbank anamorph", "teleomorph"]): - synonyms.add( (nodename, taxname) ) - print(len(node2taxname), "names loaded.") - print(len(synonyms), "synonyms loaded.") - - print("Loading nodes...") - for line in tar.extractfile("nodes.dmp"): - line = str(line.decode()) - fields = line.split("|") - nodename = fields[0].strip() - parentname = fields[1].strip() - n = Tree() - n.name = nodename - n.taxname = node2taxname[nodename] - if nodename in node2common: - n.common_name = node2common[nodename] - n.rank = fields[2].strip() - parent2child[nodename] = parentname - name2node[nodename] = n - print(len(name2node), "nodes loaded.") - - print("Linking nodes...") - for node in name2node: - if node == "1": - t = name2node[node] - else: - parent = parent2child[node] - parent_node = name2node[parent] - parent_node.add_child(name2node[node]) - print("Tree is loaded.") - return t, synonyms - -def generate_table(t): - OUT = open("taxa.tab", "w") - for j, n in enumerate(t.traverse()): - if j%1000 == 0: - print("\r",j,"generating entries...", end=' ') - temp_node = n - track = [] - while temp_node: - track.append(temp_node.name) - temp_node = temp_node.up - if n.up: - print('\t'.join([n.name, n.up.name, n.taxname, getattr(n, "common_name", ""), n.rank, ','.join(track)]), file=OUT) - else: - print('\t'.join([n.name, "", n.taxname, getattr(n, "common_name", ""), n.rank, ','.join(track)]), file=OUT) - OUT.close() - -def update_db(dbfile, targz_file=None): - basepath = os.path.split(dbfile)[0] - if basepath and not os.path.exists(basepath): - os.mkdir(basepath) - - if not targz_file: - try: - from urllib import urlretrieve - except ImportError: - from urllib.request import urlretrieve - - print('Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...', file=sys.stderr) - urlretrieve("http://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz", "taxdump.tar.gz") - print('Done. Parsing...', file=sys.stderr) - targz_file = "taxdump.tar.gz" - - tar = tarfile.open(targz_file, 'r') - t, synonyms = load_ncbi_tree_from_dump(tar) - prepostorder = [int(node.name) for post, node in t.iter_prepostorder()] - pickle.dump(prepostorder, open(dbfile+'.traverse.pkl', "wb"), 2) - - print("Updating database: %s ..." %dbfile) - generate_table(t) - - open("syn.tab", "w").write('\n'.join(["%s\t%s" %(v[0],v[1]) for v in synonyms])) - - with open("merged.tab", "w") as merged: - for line in tar.extractfile("merged.dmp"): - line = str(line.decode()) - out_line = '\t'.join([_f.strip() for _f in line.split('|')[:2]]) - merged.write(out_line+'\n') - try: - upload_data(dbfile) - except: - raise - else: - os.system("rm syn.tab merged.tab taxa.tab") - # remove only downloaded taxdump file - if not targz_file: - os.system("rm taxdump.tar.gz") - -def upload_data(dbfile): - print() - print('Uploading to', dbfile) - basepath = os.path.split(dbfile)[0] - if basepath and not os.path.exists(basepath): - os.mkdir(basepath) - - db = sqlite3.connect(dbfile) - - create_cmd = """ - DROP TABLE IF EXISTS stats; - DROP TABLE IF EXISTS species; - DROP TABLE IF EXISTS synonym; - DROP TABLE IF EXISTS merged; - CREATE TABLE stats (version INT PRIMARY KEY); - CREATE TABLE species (taxid INT PRIMARY KEY, parent INT, spname VARCHAR(50) COLLATE NOCASE, common VARCHAR(50) COLLATE NOCASE, rank VARCHAR(50), track TEXT); - CREATE TABLE synonym (taxid INT,spname VARCHAR(50), PRIMARY KEY (spname, taxid)); - CREATE TABLE merged (taxid_old INT, taxid_new INT); - CREATE INDEX spname1 ON species (spname COLLATE NOCASE); - CREATE INDEX spname2 ON synonym (spname COLLATE NOCASE); - """ - for cmd in create_cmd.split(';'): - db.execute(cmd) - print() - - db.execute("INSERT INTO stats (version) VALUES (%d);" %DB_VERSION) - db.commit() - - for i, line in enumerate(open("syn.tab")): - if i%5000 == 0 : - print('\rInserting synonyms: % 6d' %i, end=' ', file=sys.stderr) - sys.stderr.flush() - taxid, spname = line.strip('\n').split('\t') - db.execute("INSERT INTO synonym (taxid, spname) VALUES (?, ?);", (taxid, spname)) - print() - db.commit() - for i, line in enumerate(open("merged.tab")): - if i%5000 == 0 : - print('\rInserting taxid merges: % 6d' %i, end=' ', file=sys.stderr) - sys.stderr.flush() - taxid_old, taxid_new = line.strip('\n').split('\t') - db.execute("INSERT INTO merged (taxid_old, taxid_new) VALUES (?, ?);", (taxid_old, taxid_new)) - print() - db.commit() - for i, line in enumerate(open("taxa.tab")): - if i%5000 == 0 : - print('\rInserting taxids: % 6d' %i, end=' ', file=sys.stderr) - sys.stderr.flush() - taxid, parentid, spname, common, rank, lineage = line.strip('\n').split('\t') - db.execute("INSERT INTO species (taxid, parent, spname, common, rank, track) VALUES (?, ?, ?, ?, ?, ?);", (taxid, parentid, spname, common, rank, lineage)) - print() - db.commit() - -if __name__ == "__main__": - ncbi = NCBITaxa() - - a = ncbi.get_descendant_taxa("hominidae") - print(a) - print(ncbi.get_common_names(a)) - print(ncbi.get_topology(a)) - b = ncbi.get_descendant_taxa("homo", intermediate_nodes=True, collapse_subspecies=True) - print(ncbi.get_taxid_translator(b)) - - print(ncbi.get_common_names(b)) - #ncbi.update_taxonomy_database() - diff --git a/deployments/biology/image/rsession.conf b/deployments/biology/image/rsession.conf deleted file mode 100644 index 1f82b6b54..000000000 --- a/deployments/biology/image/rsession.conf +++ /dev/null @@ -1,2 +0,0 @@ -# Use binary packages! -r-cran-repos=https://packagemanager.rstudio.com/all/__linux__/jammy/latest diff --git a/deployments/cee/hubploy.yaml b/deployments/cee/hubploy.yaml index ae88ce005..7ad9e5d0a 100644 --- a/deployments/cee/hubploy.yaml +++ b/deployments/cee/hubploy.yaml @@ -1,12 +1,6 @@ images: images: - - name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/cee-image - path: image/ - registry: - provider: gcloud - gcloud: - project: ucb-datahub-2018 - service_key: gcr-key.json + - name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/cee-image:placeholder cluster: provider: gcloud diff --git a/deployments/cee/image/README.md b/deployments/cee/image/README.md new file mode 100644 index 000000000..22da95223 --- /dev/null +++ b/deployments/cee/image/README.md @@ -0,0 +1,5 @@ +# Cee Image + +This image is now located [in its own repo](https://github.com/berkeley-dsep-infra/cee-user-image). + +Please see [the contribution guide](https://github.com/berkeley-dsep-infra/cee-user-image/blob/main/CONTRIBUTING.md) for instructions on how to propose changes to the image. diff --git a/deployments/cee/image/apt.txt b/deployments/cee/image/apt.txt deleted file mode 100644 index fdb6e7bd3..000000000 --- a/deployments/cee/image/apt.txt +++ /dev/null @@ -1,36 +0,0 @@ -# installing less as more just isn't enough -less - -# For desktop environment -dbus-x11 -xfce4 -xfce4-panel -xfce4-session -xfce4-settings -xubuntu-icon-theme -xauth -# You definitely need a browser -firefox -# And a text editor -gedit -# And a terminal -xfce4-terminal - -# https://github.com/berkeley-dsep-infra/datahub/issues/2535 -emacs - -# vim4ever! -vim - -# https://github.com/berkeley-dsep-infra/datahub/issues/3679 -texlive-xetex -# https://github.com/berkeley-dsep-infra/datahub/issues/3721 -texlive-fonts-recommended -texlive-plain-generic -texlive-lang-chinese - -# Needed for qgis maybe? https://github.com/conda-forge/pygridgen-feedstock/issues/10 -libgl1-mesa-glx - -# For jupyter-tree-download. Ref: https://github.com/berkeley-dsep-infra/datahub/issues/3979 -zip diff --git a/deployments/cee/image/environment.yml b/deployments/cee/image/environment.yml deleted file mode 100644 index 7ce6957e0..000000000 --- a/deployments/cee/image/environment.yml +++ /dev/null @@ -1,31 +0,0 @@ -channels: -- conda-forge - -# Almost all libraries should be added in requirements.txt -# Only libraries *not* available in PyPI should be here -dependencies: -- python=3.11.* -- jupyter-server-proxy==4.2.0 -#adding math functionality -- matplotlib=3.7.* -- scipy=1.10.* -- mpld3=0.5.* - -# Needed for linux desktop environment -- websockify=0.11.0 - -- qgis=3.22.* - -# Maybe needed for qgis? https://github.com/conda-forge/qgis-feedstock/issues/263 -- pyopencl=2023.1 - -- pip -- pip: -# For desktop environment - - jupyter-desktop-server==0.1.3 - - -r infra-requirements.txt - - otter-grader==3.1.4 - # for notebook exporting - - nbconvert==7.6.0 - - nb2pdf==0.6.2 - - nbpdfexport==0.2.1 diff --git a/deployments/cee/image/infra-requirements.txt b/deployments/cee/image/infra-requirements.txt deleted file mode 100644 index 0fb0bd930..000000000 --- a/deployments/cee/image/infra-requirements.txt +++ /dev/null @@ -1,29 +0,0 @@ -# WARNING: Original source at scripts/infra-packages/requirements.txt -# PLEASE DO NOT EDIT ELSEWHERE -# After editing scripts/infra-packages/requirements.txt, please run -# scripts/infra-packages/sync.bash. - -# This file pins versions of notebook related python packages we want -# across all hubs. This makes sure we don't need to upgrade them -# everwhere one by one. - -# FIXME: Freeze this to get exact versions of all dependencies -notebook==7.0.7 -jupyterlab==4.0.11 -nbgitpuller==1.2.1 -jupyter-resource-usage==1.0.1 -# Matches version in images/hub/Dockerfile -jupyterhub==4.1.6 -appmode==0.8.0 -ipywidgets==8.0.7 -jupyter-tree-download==1.0.1 -git-credential-helpers==0.2 -# Measure popularity of different packages in our hubs -# https://discourse.jupyter.org/t/request-for-implementation-instrument-libraries-actively-used-by-users-on-a-jupyterhub/7994?u=yuvipanda -git+https://github.com/shaneknapp/python-popularity-contest.git@add-error-handling -# RISE is useful for presentations - see https://github.com/berkeley-dsep-infra/datahub/issues/2527 -RISE==5.7.1 -# syncthing for dropbox-like functionality -jupyter-syncthing-proxy==1.0.3 -# jupyter archival tool for easy user downloads -jupyter-archive==3.4.0 diff --git a/deployments/cee/image/postBuild b/deployments/cee/image/postBuild deleted file mode 100755 index 0b401ab73..000000000 --- a/deployments/cee/image/postBuild +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# HORRIBLE HACK to get qgis working. conda-forge brings in libgsl 2.7, but this seems -# to require libgsl 2.5? And I can't force conda to install that. This might break something -# deep somewhere. See https://github.com/conda-forge/qgis-feedstock/issues/263 -# for upstream discussion -#ln -s /srv/conda/envs/notebook/lib/libgsl.so.27 /srv/conda/envs/notebook/lib/libgsl.so.25 diff --git a/deployments/cee/image/qgis.desktop b/deployments/cee/image/qgis.desktop deleted file mode 100755 index 3c5efdd84..000000000 --- a/deployments/cee/image/qgis.desktop +++ /dev/null @@ -1,6 +0,0 @@ -[Desktop Entry] -Version=1.0 -Type=Application -Name=qgis -Exec=qgis -Icon=/srv/conda/envs/notebook/share/qgis/images/icons/qgis-icon-512x512.png \ No newline at end of file diff --git a/deployments/cee/image/start b/deployments/cee/image/start deleted file mode 100755 index 1ff95e436..000000000 --- a/deployments/cee/image/start +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -euo pipefail - -mkdir -p $HOME/Desktop - -cp $REPO_DIR/*.desktop $HOME/Desktop/ - -exec "$@" \ No newline at end of file