diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..aeac7333 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,18 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + groups: + actions: + patterns: + - "*" + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + groups: + actions: + patterns: + - "*" diff --git a/.github/workflows/build_app_runner.yml b/.github/workflows/build_app_runner.yml new file mode 100644 index 00000000..ce73ba00 --- /dev/null +++ b/.github/workflows/build_app_runner.yml @@ -0,0 +1,51 @@ +name: Build App Runner +on: + workflow_dispatch: + release: + types: [published] + tags: + - "app-runner/*" +jobs: + build_app_runner: + name: Build App Runner + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build + run: | + bash app_runner/deploy/build.sh build-output app_runner + ls -l build-output + - name: Verify + run: | + build-output/app_runner/app_runner --help + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: app_runner_linux_x86_64 + path: build-output + publish_release_artifact: + name: Publish Release Artifact + runs-on: ubuntu-latest + needs: build_app_runner + if: github.event_name == 'release' + steps: + - name: Download Artifact + uses: actions/download-artifact@v4 + with: + name: app_runner_linux_x86_64 + path: app_runner_linux_x86_64 + - name: Zip for release + run: | + cd app_runner_linux_x86_64 + zip -r app_runner_linux_x86_64.zip * + ls -l + - name: Print folder structure + run: | + tree + - name: Upload Release Assets + uses: softprops/action-gh-release@v2 + with: + files: | + app_runner_linux_x86_64/app_runner_linux_x86_64.zip + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/run_unit_tests.yml b/.github/workflows/run_unit_tests.yml index 01e61638..8b118217 100644 --- a/.github/workflows/run_unit_tests.yml +++ b/.github/workflows/run_unit_tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: 3.9 - name: Install nox diff --git a/.gitignore b/.gitignore index 42414e62..39d119ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ .idea/ __pycache__ -bfabric.egg-info/ +*.egg-info/ bfabric/scripts/query_result.txt build/ dist/ site/ +_build/ diff --git a/README.md b/README.md index 2d303871..7c901644 100644 --- a/README.md +++ b/README.md @@ -1,271 +1,28 @@ -![unitTests](https://github.com/fgcz/bfabricPy/workflows/unit%20tests/badge.svg) -[![EDBT'10](https://img.shields.io/badge/EDBT-10.1145%2F1739041.1739135-brightgreen)](https://doi.org/10.1145/1739041.1739135) -[![JIB](https://img.shields.io/badge/JIB-10.1515%2Fjib.2022.0031-brightgreen)](https://doi.org/10.1515/jib-2022-0031) - # bfabricPy -This package connects the [bfabric](https://fgcz-bfabric.uzh.ch/bfabric/) system to the [python](https://www.python.org/) and [R](https://cran.r-project.org/) world while providing a JSON and REST interface using [Flask](https://www.fullstackpython.com). -The [bfabricShiny](https://github.com/cpanse/bfabricShiny) R package is an extension and provides code snippets and sample implementation for a seamless R shiny bfabric integration. -For more advanced users the *bfabricPy* package also provides a powerful query interface on the command-line though using the provided scripts. 
- -You can find the up-to-date documentation at [https://fgcz.github.io/bfabricPy](https://fgcz.github.io/bfabricPy). - -## CheatSheet - -### Read - -```{bash} -bfabric_read.py storage -bfabric_read.py application -``` - -Simple database query examples - -```{bash} -bfabric_read.py user login cpanse -bfabric_read.py project id 3000 -bfabric_read.py workunit id 199387 -bfabric_read.py sample name autoQC4L -bfabric_read.py workunit status processing -bfabric_read.py workunit status pending -bfabric_read.py workunit status failed - -# list empty resources -bfabric_read.py resource filechecksum d41d8cd98f00b204e9800998ecf8427e -``` - -Using the Python API: - -```{py} -from bfabric import Bfabric - -client = Bfabric.from_config() - -user = client.read(endpoint = 'user', obj={'login': 'cpanse'}) -resource = client.read(endpoint = 'resource', obj={'id': 550327 }) -``` - -### save - -```{bash} -bfabric_save_workunit_attribute.py 199387 status available -``` - -```{python} -import json -rv = client.save('workunit', {'id': 254063, 'status': 'available'}) -print(json.dumps(rv.to_list_dict(), indent=2)) -``` - -### Command line code snippet - -Find empty resource files in bfabric - -```{bash} -bfabric_read.py resource filechecksum `md5sum < /dev/null | cut -c-32` \ - | cat -n \ - | tail -``` - -## Examples \[outdated\] - -### bash script generated by the yaml wrapper creator / submitter - -externaljobid-45939_executableid-15312.bash listing: - -```bash -#!/bin/bash -# -# $HeadURL: http://fgcz-svn.uzh.ch/repos/scripts/trunk/linux/bfabric/apps/python/README.md $ -# $Id: README.md 2535 2016-10-24 08:49:17Z cpanse $ -# Christian Panse 2007-2015 - -# Grid Engine Parameters -#$ -q PRX@fgcz-c-071 -#$ -e /home/bfabric/sgeworker/logs/workunitid-134923_resourceid-203236.err -#$ -o /home/bfabric/sgeworker/logs/workunitid-134923_resourceid-203236.out - - -set -e -set -o pipefail - -export EXTERNALJOBID=45938 -export RESSOURCEID_OUTPUT=203238 -export RESSOURCEID_STDOUT_STDERR="203237 203238" -export OUTPUT="bfabric@fgczdata.fgcz-net.unizh.ch:/srv/www/htdocs//p1000/bfabric/Proteomics/gerneric_yaml/2015/2015-09/2015-09-02//workunit_134923//203236.zip" - -# job configuration set by B-Fabrics wrapper_creator executable -_OUTPUT=`echo $OUTPUT | cut -d"," -f1` -test $? -eq 0 && _OUTPUTHOST=`echo $_OUTPUT | cut -d":" -f1` -test $? -eq 0 && _OUTPUTPATH=`echo $_OUTPUT | cut -d":" -f2` -test $? -eq 0 && _OUTPUTPATH=`dirname $_OUTPUTPATH` -test $? -eq 0 && ssh $_OUTPUTHOST "mkdir -p $_OUTPUTPATH" - -if [ $? -eq 1 ]; -then - echo "writting to output url failed!"; - exit 1; -fi - -cat > /tmp/yaml_config.$$ < - - - - - XXX - XXX - - 482 - - - - - ' -} - -for url in https://fgcz-bfabric.uzh.ch/bfabric/user?wsdl https://fgcz-bfabric-test.uzh.ch/bfabric/user?wsdl; -do - echo - echo "==== ${url} === " - query ${url} -done - -echo $? 
-``` - -### Example usage - -remove accidentally inserted mgf files - -``` -bfabric_read.py importresource \ - | grep mgf$ \ - | awk '{print $1}' \ - | tee /tmp/$$.log \ - | while read i; - do - bfabric_delete.py importresource $i ; - done -``` - -## Send an E-mail \[outdated\] - -``` -# by CT,CP -# not implemented yet 2022-10-19 , -rv = B.save_object(endpoint = 'mail', - obj={'subject': "TEST", - 'recipientemail': 'bfabrictest482.cp@fgcz.ethz.ch', - 'message': "TEST; ignore that email", - 'parentId': 482, - 'parentClassName': 'user'}) -# shown as mail for user id 482 -``` - -## See also - -- [bfabric documentation](https://fgcz-bfabric.uzh.ch/wiki/HomePage) -- [FAQ](faq.md) -- [wsdl4BFabric](http://fgcz-intranet.uzh.ch/tiki-index.php?page=wsdl4BFabric) wiki page -- WSDL Interface to B-Fabric [endpoints](http://fgcz-bfabric.uzh.ch/bfabric/workunit?wsdl) - -## FAQ - -### How to resolve ` create `inputs.yml` files and 1 `chunks.yml` file +- process -> process a particular chunk (after inputs have been prepared) +- collect -> collect the results of a chunk and create `outputs.yml` files + +The individual app can be in a container environment or a script running in the same environment as the app runner. + +To make this possible input and output staging is abstracted and communicated through `inputs.yml` and `outputs.yml` +specification files. +A command is available to stage the inputs or register the outputs respectively then. diff --git a/app_runner/deploy/build.sh b/app_runner/deploy/build.sh new file mode 100644 index 00000000..4df17b1c --- /dev/null +++ b/app_runner/deploy/build.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -euxo pipefail +# Parse arguments +TARGET_DIR=$(readlink -f "${1:-./dist}") +TARGET_NAME="${2:-app_runner}" +DOCKER=docker + +DEPLOY_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +APP_RUNNER_PROJECT_DIR=$(realpath "$DEPLOY_DIR/..") +BUILDER_IMAGE=local-build_app_runner:0.0.1 +$DOCKER build -t $BUILDER_IMAGE "$DEPLOY_DIR/builder" + +mkdir -p "$TARGET_DIR" +$DOCKER run \ + --user "$(id -u):$(id -g)" \ + --rm \ + --mount type=bind,source="$APP_RUNNER_PROJECT_DIR",target=/work/app_runner \ + --mount type=bind,source="$DEPLOY_DIR"/build_steps.sh,target=/work/build_steps.sh,readonly \ + --mount type=bind,source="$TARGET_DIR",target=/work/dist \ + --workdir /work/app_runner \ + "$BUILDER_IMAGE" \ + bash /work/build_steps.sh /work/dist "$TARGET_NAME" diff --git a/app_runner/deploy/build_steps.sh b/app_runner/deploy/build_steps.sh new file mode 100644 index 00000000..56b7dd04 --- /dev/null +++ b/app_runner/deploy/build_steps.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -euxo pipefail +TARGET_DIR="${1:-dist}" +TARGET_NAME="${2:-app_runner}" +rm -rf /work/venv +python -m venv /work/venv +source /work/venv/bin/activate +uv pip install . 
+uv pip install pyinstaller +pyinstaller -y --onedir --name "${TARGET_NAME}" --distpath "${TARGET_DIR}" src/app_runner/cli/__main__.py +deactivate diff --git a/app_runner/deploy/builder/Dockerfile b/app_runner/deploy/builder/Dockerfile new file mode 100644 index 00000000..aa1e1959 --- /dev/null +++ b/app_runner/deploy/builder/Dockerfile @@ -0,0 +1,25 @@ +ARG DEBIAN_VERSION=buster +FROM debian:${DEBIAN_VERSION} +ARG PYTHON_VERSION=3.13.0 + +LABEL org.opencontainers.image.authors="Leonardo Schwarz" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y curl git bash build-essential ccache \ + && apt-get install -y libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev curl git libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev + +RUN curl https://pyenv.run | bash + +ENV PATH=$PATH:/root/.pyenv/bin +RUN pyenv install $PYTHON_VERSION +RUN pyenv global $PYTHON_VERSION +ENV PATH=/root/.pyenv/versions/${PYTHON_VERSION}/bin:$PATH + +RUN pip install --root-user-action ignore uv pyinstaller +RUN chmod -R 0777 /root +RUN mkdir /work && chmod 0777 /work +RUN mkdir /home/user && chmod 0777 /home/user + +ENV HOME=/home/user +WORKDIR /work diff --git a/app_runner/docs/Makefile b/app_runner/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/app_runner/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/app_runner/docs/architecture/overview.md b/app_runner/docs/architecture/overview.md new file mode 100644 index 00000000..dbb59975 --- /dev/null +++ b/app_runner/docs/architecture/overview.md @@ -0,0 +1,13 @@ +## Architecture Overview + +### App model + +```{eval-rst} +.. uml:: uml/app_model.plantuml +``` + +### App runner activity diagram + +```{eval-rst} +.. uml:: uml/app_runner_activity.plantuml +``` diff --git a/app_runner/docs/architecture/uml/app_model.plantuml b/app_runner/docs/architecture/uml/app_model.plantuml new file mode 100644 index 00000000..b5f6c307 --- /dev/null +++ b/app_runner/docs/architecture/uml/app_model.plantuml @@ -0,0 +1,41 @@ +@startuml + +participant BFabric +participant Storage +participant AppRunner +participant App + +BFabric -> AppRunner: run(app) + +group dispatch app + AppRunner -> App: dispatch(app) + App -> BFabric: query + BFabric -> App: response + App -> AppRunner: "chunks.yml", "chunk1/inputs.yml", ... 
+end + +loop for each chunk + group prepare inputs + AppRunner -> BFabric: query + BFabric -> AppRunner: response + Storage -> AppRunner: Copy Files + end + + group process chunk + AppRunner -> App: process("chunk1") + App -> AppRunner: notify + end + + group collect outputs + AppRunner -> App: collect("chunk1") + App -> BFabric: query + BFabric -> App: response + App -> AppRunner: "outputs.yml" + end + + AppRunner -> Storage: Copy Files + AppRunner -> BFabric: Save + +end + +@enduml diff --git a/app_runner/docs/architecture/uml/app_runner_activity.plantuml b/app_runner/docs/architecture/uml/app_runner_activity.plantuml new file mode 100644 index 00000000..c880c126 --- /dev/null +++ b/app_runner/docs/architecture/uml/app_runner_activity.plantuml @@ -0,0 +1,62 @@ +@startuml + title + App Runner Activity Diagram + end title + + start + :workunit_ref] + partition App Runner { + :Retrieve workunit and app information; + split + :workunit_definition.yml] + split again + :app_definition.yml] + note right + These are maintained in a + centralized repository. + end note + end split + + :Set workunit processing status; + :""app-runner app dispatch""; + note right + This step is supposed to be deterministic! + To allow distributing the tasks in the future. + end note + split + :tasks.yml] + split again + :task1/inputs.yml + task1/params.yml] + split again + :task2/inputs.yml + task2/params.yml] + end split + ' Unclear + ' :Precondition check; + :""app-runner chunk process-all""] + } + + note right + The actual ordering will be decided here. + tasks.yml declares task dependencies. + end note + fork + :Stage inputs 1; + partition App { + :Run task 1; + :outputs.yml] + } + :Register outputs; + fork again + :Stage inputs 2; + partition App { + :Run task 2; + :outputs.yml] + } + :Register outputs; + end fork + + :Set workunit available status; + stop +@enduml diff --git a/app_runner/docs/changelog.md b/app_runner/docs/changelog.md new file mode 100644 index 00000000..b1b68a6e --- /dev/null +++ b/app_runner/docs/changelog.md @@ -0,0 +1,56 @@ +# Changelog + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). + +## \[Unreleased\] + +## \[0.0.7\] - 2024-11-22 + +### Fixed + +- When executing `app run` the experimental entity cache created incorrect behavior. The caching is temporarily disabled, + until the issue is resolved. + +## \[0.0.6\] - 2024-11-14 + +First version with CD that will trigger the deployment automatically. + +### Fixed + +- Output spec was broken since `Path` was moved into `if TYPE_CHECKING` block. + +### Changed + +- The app spec is now strict and will fail parsing if there are any unknown fields in the spec. It is better to find + this type of error early. +- Log messages originating in `app_runner` should be printed now, they were previously muted (unintentionally). + +## \[0.0.5\] - 2024-11-11 + +### Added + +- `CommandDocker.mac_address`: allows to specify the MAC address of the container. +- `CommandDocker.custom_args`: allows to specify arbitrary additional arguments to the `docker run` command. + +## \[0.0.4\] - 2024-11-11 + +### Added + +- `MountOptions.writeable` list for writeable mount points. + +## \[0.0.3\] - 2024-10-24 + +### Added + +- Specify environment variables for docker container in spec. + +## \[0.0.2\] - 2024-10-23 + +### Added + +- App spec supports changing docker entrypoint. 
+- `bfabric-app-runner inputs check` to validate the local files + +### Fixed + +- `bfabric-app-runner inputs list` does not fail anymore if resources have no "name" field value. diff --git a/app_runner/docs/conf.py b/app_runner/docs/conf.py new file mode 100644 index 00000000..718cc811 --- /dev/null +++ b/app_runner/docs/conf.py @@ -0,0 +1,30 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +from pathlib import Path + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "App Runner" +copyright = "2024 ETH Zurich" +author = "Leonardo Schwarz" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["myst_parser", "sphinx.ext.autodoc", "sphinxcontrib.autodoc_pydantic", "sphinxcontrib.plantuml"] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +plantuml = str((Path(__file__).parent / "plantuml_wrapper.sh").absolute()) +plantuml_output_format = "svg" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_book_theme" +html_static_path = ["_static"] diff --git a/app_runner/docs/index.md b/app_runner/docs/index.md new file mode 100644 index 00000000..fc94d635 --- /dev/null +++ b/app_runner/docs/index.md @@ -0,0 +1,18 @@ +## Install App Runner + +```bash +pipx install app_runner@git+https://github.com/fgcz/bfabricPy.git@main#egg=app_runner&subdirectory=app_runner +``` + +## Contents + +```{toctree} +:glob: +workunit_definition +architecture/overview +specs/input_specification +specs/output_specification +specs/app_specification +changelog +* +``` diff --git a/app_runner/docs/make.bat b/app_runner/docs/make.bat new file mode 100644 index 00000000..32bb2452 --- /dev/null +++ b/app_runner/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/app_runner/docs/plantuml_wrapper.sh b/app_runner/docs/plantuml_wrapper.sh new file mode 100755 index 00000000..f21e1b93 --- /dev/null +++ b/app_runner/docs/plantuml_wrapper.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +# Configuration +DOCKER="docker" +IMAGE="plantuml/plantuml:1.2024.7" + +# Ensure container exists +$DOCKER pull $IMAGE >/dev/null 2>&1 + +# Read PlantUML input +input=$(cat) + +# Check if input is empty +if [ -z "$input" ]; then + echo "Error: No PlantUML diagram provided via stdin" >&2 + exit 1 +fi + +# Run PlantUML in pipe mode +echo "$input" | $DOCKER run --rm -i \ + --user "$(id -u):$(id -g)" \ + $IMAGE \ + "$@" diff --git a/app_runner/docs/specs/app_specification.md b/app_runner/docs/specs/app_specification.md new file mode 100644 index 00000000..0e09086a --- /dev/null +++ b/app_runner/docs/specs/app_specification.md @@ -0,0 +1,13 @@ +## App specification + +TODO: not clear if this same document should also explain the individual steps, or if it would make sense to first +describe the app anatomy in a separate document with figures etc. and then list how to specify it + +## Reference + +```{eval-rst} +.. automodule:: app_runner.specs.app_spec + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/app_runner/docs/specs/input_specification.md b/app_runner/docs/specs/input_specification.md new file mode 100644 index 00000000..2148c46a --- /dev/null +++ b/app_runner/docs/specs/input_specification.md @@ -0,0 +1,92 @@ +## Input specification + +The inputs module provides a specification schema to define the inputs required by an app. +You can also use this functionality interactively while prototyping. +The file is usually called `inputs.yml` and lists the different inputs, with information and how to retrieve them and +the filename to save them as. + +### General structure + +Generally the structure is a yaml file containing a key `inputs` which is a list of dictionaries, each representing an +input file. +Each input has a `type` key which identifies the input type. +This will allow us to extend this logic to different sources in the future. + +In general the only other input key that will be available for all types is `filename`, which is the name of the file to +save the input as. +Fields like `id` might not be relevant for all types in the future, and depending on the type more specific options +might exist. + +An example file could look like this: + +```yaml +# file: inputs.yml +inputs: + - type: bfabric_dataset + id: 53706 + filename: test.csv + - type: bfabric_resource + id: 2700958 + filename: test.zip +``` + +## Commands + +### Validation + +The input file can be validated with the command: + +```bash +bfabric-app-runner validate inputs-spec inputs.yml +``` + +Which on success will output a pretty-printed version of the inputs file. +Validation will also be performed by all other commands, so this is not strictly necessary. 
+ +For instance, in the above case this would print: + +``` +InputsSpec( +│ inputs=[ +│ │ DatasetSpec(type='bfabric_dataset', id=53706, filename='test.csv', separator=','), +│ │ ResourceSpec(type='bfabric_resource', id=2700958, filename='test.zip', check_checksum=True) +│ ] +) +``` + +Here you can also see all the extra parameters which were implicitly set. + +### Prepare files + +The prepare command downloads your files and requires two arguments. +The first is the input file, and the second is the directory to save the files to. +In general to download to the current directory simply use `.` as the second argument: + +```bash +bfabric-app-runner inputs prepare inputs.yml . +``` + +If your files already exist and are up-to-date, it will not download them again. + +### List files + +You can list the files that are present or will be downloaded: + +```bash +bfabric-app-runner inputs list inputs.yml . +``` + +If you also want to check whether the files are up-to-date, you can pass the `--check` flag: + +```bash +bfabric-app-runner inputs list --check inputs.yml . +``` + +## Reference + +```{eval-rst} +.. automodule:: app_runner.specs.inputs_spec + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/app_runner/docs/specs/output_specification.md b/app_runner/docs/specs/output_specification.md new file mode 100644 index 00000000..668f95b3 --- /dev/null +++ b/app_runner/docs/specs/output_specification.md @@ -0,0 +1,60 @@ +## Output specification + +The outputs module provides a specification schema to define the outputs that were created by an app and should be registered. +The file is usually called `outputs.yml` and lists the different output files, with information how to register them. + +### General structure + +Generally the structure is a yaml file containing a key `outputs` which is a list of dictionaries, each representing an +output file. +Each output has a `type` key which identifies the output type. +This will allow us to extend this logic to different sources in the future. + +An example file could look like: + +```yaml +outputs: +- type: bfabric_copy_resource + local_path: /tmp/work/hello.txt + store_entry_path: WU123456_hello.txt +- type: bfabric_dataset + local_path: /tmp/work/hello.csv + separator: "," + name: Hello Dataset +``` + +## Commands + +### Validation + +The output file can be validated with the command: + +```bash +bfabric-app-runner validate outputs-spec outputs.yml +``` + +Which on success will output a pretty-printed version of the outputs file. +Validation will also be performed by all other commands, so this is not strictly necessary. + +### Register files + +To perform the registration to B-Fabric the following can be used: + +```bash +bfabric-app-runner outputs register outputs.yml --workunit-id 1234 +``` + +Please note: + +- The workunit ID needs to be specified, so the correct information can be retrieved. (TODO but instead of the workunit id it should also be possible to pass the ref) +- Several actions might require a particular user to be possible, e.g. the `bfabric_copy_resource` will require a user + with permission to create the particular file over SSH. + +## Reference + +```{eval-rst} +.. 
automodule:: app_runner.specs.outputs_spec + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/app_runner/docs/workunit_definition.md b/app_runner/docs/workunit_definition.md new file mode 100644 index 00000000..06e08e1b --- /dev/null +++ b/app_runner/docs/workunit_definition.md @@ -0,0 +1,32 @@ +## Workunit Definition + +The idea of the workunit definition is to provide a persistable and comprehensive description of a workunit. +To keep the logic even more modular it is separated into two components, the `execution` and the `registration` +information. + +### Creating WorkunitDefinition instances + +The `WorkunitDefinition` class is a Pydantic model and can be created by passing a dictionary to the constructor. +However, for convenience and easier integration into command line tools there is a constructor for both creating an +instance from a Bfabric entity, and parsing a YAML file which contains a persisted version of the workunit + +### Workunit references + +Several functions and command line tools allow providing a "workunit reference". This means, that either the ID or a +path to a local YAML file can be passed to this function. +If the input is a path, then the persisted information will be retrieved to instantiate a `WorkunitDefinition` instance, +whereas if it is an integer, the information will be obtained by querying the B-Fabric API. + +Since in some workflows the workunit will be used several times, and in particular not necessarily in the same process, +the usual entity caching mechanism might not be able to cache the requests. +Therefore, in many cases passing a reference to a YAML file is the preferred way to provide the workunit information, +as it will reduce the number of requests to the B-Fabric API (sometimes even to zero). + +### Reference + +```{eval-rst} +.. 
automodule:: bfabric.experimental.workunit_definition + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/app_runner/pyproject.toml b/app_runner/pyproject.toml new file mode 100644 index 00000000..33dc7a80 --- /dev/null +++ b/app_runner/pyproject.toml @@ -0,0 +1,55 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "app_runner" +description = "Application runner for B-Fabric apps" +version = "0.0.6" +license = { text = "GPL-3.0" } +authors = [ + {name = "Leonardo Schwarz", email = "leonardo.schwarz@fgcz.ethz.ch"}, +] +requires-python = ">=3.12" +dependencies = [ + "bfabric @ git+https://github.com/fgcz/bfabricPy.git@main", + "pydantic", + "glom", +] + +[project.scripts] +"bfabric-app-runner"="app_runner.cli.__main__:app" + +[project.optional-dependencies] +doc = [ + "sphinx", + "myst-parser", + "autodoc_pydantic", + "sphinx-book-theme", + "sphinx-autobuild", + "sphinxcontrib-plantuml", +] +dev = [ + "app_runner[doc]", + "bfabric[dev, typing]", + "mypy", + "types-PyYAML", +] + +[tool.uv] +reinstall-package = ["app_runner"] + +[tool.black] +line-length = 120 + +[tool.ruff] +line-length = 120 +indent-width = 4 + +[tool.ruff.lint] +select = ["ANN", "BLE", "D103", "E", "EXE", "F", "N", "PLW", "PTH", "SIM", "TCH", "UP", "W191"] +ignore = ["ANN101", "ANN102"] + +[tool.ruff.lint.per-file-ignores] +# This is needed because of false positives in cyclopts code +"**/app_runner/cli/**" = ["TCH001", "TCH002", "TCH003"] diff --git a/src/bfabric/experimental/app_interface/app_runner/__init__.py b/app_runner/src/app_runner/__init__.py similarity index 100% rename from src/bfabric/experimental/app_interface/app_runner/__init__.py rename to app_runner/src/app_runner/__init__.py diff --git a/src/bfabric/experimental/app_interface/cli/__init__.py b/app_runner/src/app_runner/app_runner/__init__.py similarity index 100% rename from src/bfabric/experimental/app_interface/cli/__init__.py rename to app_runner/src/app_runner/app_runner/__init__.py diff --git a/src/bfabric/experimental/app_interface/app_runner/runner.py b/app_runner/src/app_runner/app_runner/runner.py similarity index 65% rename from src/bfabric/experimental/app_interface/app_runner/runner.py rename to app_runner/src/app_runner/app_runner/runner.py index 03f9514a..abfc9385 100644 --- a/src/bfabric/experimental/app_interface/app_runner/runner.py +++ b/app_runner/src/app_runner/app_runner/runner.py @@ -3,16 +3,19 @@ import shlex import subprocess from pathlib import Path -from loguru import logger +from typing import TYPE_CHECKING import yaml +from bfabric.experimental.workunit_definition import WorkunitDefinition +from loguru import logger from pydantic import BaseModel -from bfabric import Bfabric -from bfabric.experimental.app_interface.app_runner._spec import AppSpec -from bfabric.experimental.app_interface.input_preparation import prepare_folder -from bfabric.experimental.app_interface.output_registration import register_outputs -from bfabric.experimental.app_interface.workunit.definition import WorkunitDefinition +from app_runner.input_preparation import prepare_folder +from app_runner.output_registration import register_outputs + +if TYPE_CHECKING: + from app_runner.specs.app_spec import AppSpec + from bfabric import Bfabric class Runner: @@ -43,9 +46,13 @@ def run_process(self, chunk_dir: Path) -> None: def run_register_outputs(self, chunk_dir: Path, workunit_ref: int | Path, reuse_default_resource: bool) -> None: workunit_definition = 
WorkunitDefinition.from_ref(workunit_ref, client=self._client) + registration = workunit_definition.registration + if registration is None: + msg = "Workunit definition does not provide registration information" + raise ValueError(msg) register_outputs( outputs_yaml=chunk_dir / "outputs.yml", - workunit_id=workunit_definition.registration.workunit_id, + workunit_id=registration.workunit_id, client=self._client, ssh_user=self._ssh_user, reuse_default_resource=reuse_default_resource, @@ -66,24 +73,35 @@ def run_app( read_only: bool = False, dispatch_active: bool = True, ) -> None: - # TODO future: the workunit definition must be loaded from bfabric exactly once! this is quite inefficient right now - workunit_definition = WorkunitDefinition.from_ref(workunit_ref, client=client) + """Executes all steps of the provided app.""" + # TODO would it be possible, to reuse the individual steps commands so there is certainly only one definition? + work_dir = work_dir.resolve() + workunit_ref = workunit_ref.resolve() if isinstance(workunit_ref, Path) else workunit_ref + + workunit_definition_file = work_dir / "workunit_definition.yml" + workunit_definition = WorkunitDefinition.from_ref( + workunit=workunit_ref, client=client, cache_file=workunit_definition_file + ) if not read_only: + # Set the workunit status to processing client.save("workunit", {"id": workunit_definition.registration.workunit_id, "status": "processing"}) runner = Runner(spec=app_spec, client=client, ssh_user=ssh_user) if dispatch_active: - runner.run_dispatch(workunit_ref=workunit_ref, work_dir=work_dir) + runner.run_dispatch(workunit_ref=workunit_definition_file, work_dir=work_dir) chunks_file = ChunksFile.model_validate(yaml.safe_load((work_dir / "chunks.yml").read_text())) for chunk in chunks_file.chunks: logger.info(f"Processing chunk {chunk}") runner.run_prepare_input(chunk_dir=chunk) runner.run_process(chunk_dir=chunk) - runner.run_collect(workunit_ref=workunit_ref, chunk_dir=chunk) + runner.run_collect(workunit_ref=workunit_definition_file, chunk_dir=chunk) if not read_only: runner.run_register_outputs( - chunk_dir=chunk, workunit_ref=workunit_ref, reuse_default_resource=app_spec.reuse_default_resource + chunk_dir=chunk, + workunit_ref=workunit_definition_file, + reuse_default_resource=app_spec.reuse_default_resource, ) if not read_only: + # Set the workunit status to available client.save("workunit", {"id": workunit_definition.registration.workunit_id, "status": "available"}) diff --git a/src/bfabric/experimental/app_interface/dispatch/__init__.py b/app_runner/src/app_runner/cli/__init__.py similarity index 100% rename from src/bfabric/experimental/app_interface/dispatch/__init__.py rename to app_runner/src/app_runner/cli/__init__.py diff --git a/app_runner/src/app_runner/cli/__main__.py b/app_runner/src/app_runner/cli/__main__.py new file mode 100644 index 00000000..b5fc0756 --- /dev/null +++ b/app_runner/src/app_runner/cli/__main__.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import importlib.metadata + +import cyclopts + +from app_runner.cli.app import app_app +from app_runner.cli.chunk import app_chunk +from app_runner.cli.inputs import app_inputs +from app_runner.cli.outputs import app_outputs +from app_runner.cli.validate import app_validate + +package_version = importlib.metadata.version("app_runner") + +app = cyclopts.App( + help="Provides an entrypoint to app execution.\n\nFunctionality/API under active development!", + version=package_version, +) +app.command(app_inputs) +app.command(app_outputs) 
+app.command(app_app) +app.command(app_chunk) +app.command(app_validate) + +if __name__ == "__main__": + app() diff --git a/src/bfabric/experimental/app_interface/cli/app.py b/app_runner/src/app_runner/cli/app.py similarity index 63% rename from src/bfabric/experimental/app_interface/cli/app.py rename to app_runner/src/app_runner/cli/app.py index d81d953c..3e060a47 100644 --- a/src/bfabric/experimental/app_interface/cli/app.py +++ b/app_runner/src/app_runner/cli/app.py @@ -5,10 +5,11 @@ import cyclopts import yaml +from app_runner.specs.app_spec import AppSpec +from app_runner.app_runner.runner import run_app, Runner from bfabric import Bfabric from bfabric.cli_formatting import setup_script_logging -from bfabric.experimental.app_interface.app_runner._spec import AppSpec -from bfabric.experimental.app_interface.app_runner.runner import run_app, Runner +from bfabric.experimental.entity_lookup_cache import EntityLookupCache app_app = cyclopts.App("app", help="Run an app.") @@ -16,7 +17,7 @@ @app_app.command() def run( app_spec: Path, - target_folder: Path, + work_dir: Path, workunit_ref: int | Path, *, ssh_user: str | None = None, @@ -27,10 +28,13 @@ def run( setup_script_logging() client = Bfabric.from_config() app_spec_parsed = AppSpec.model_validate(yaml.safe_load(app_spec.read_text())) + # TODO(#107): usage of entity lookup cache was problematic -> beyond the full solution we could also consider + # to deactivate the cache for the output registration + # with EntityLookupCache.enable(): run_app( app_spec=app_spec_parsed, workunit_ref=workunit_ref, - work_dir=target_folder, + work_dir=work_dir, client=client, ssh_user=ssh_user, read_only=read_only, @@ -50,7 +54,9 @@ def dispatch( :param workunit_ref: Reference to the workunit (ID or YAML file path). """ setup_script_logging() + work_dir = work_dir.resolve() # TODO set workunit to processing? (i.e. add read-only option here) client = Bfabric.from_config() - runner = Runner(spec=AppSpec.model_validate(yaml.safe_load(app_spec.read_text())), client=client, ssh_user=None) - runner.run_dispatch(workunit_ref=workunit_ref, work_dir=work_dir) + with EntityLookupCache.enable(): + runner = Runner(spec=AppSpec.model_validate(yaml.safe_load(app_spec.read_text())), client=client, ssh_user=None) + runner.run_dispatch(workunit_ref=workunit_ref, work_dir=work_dir) diff --git a/src/bfabric/experimental/app_interface/cli/chunk.py b/app_runner/src/app_runner/cli/chunk.py similarity index 76% rename from src/bfabric/experimental/app_interface/cli/chunk.py rename to app_runner/src/app_runner/cli/chunk.py index 5392dd97..fc6a6060 100644 --- a/src/bfabric/experimental/app_interface/cli/chunk.py +++ b/app_runner/src/app_runner/cli/chunk.py @@ -5,10 +5,11 @@ import cyclopts import yaml +from app_runner.specs.app_spec import AppSpec +from app_runner.app_runner.runner import run_app, Runner from bfabric import Bfabric from bfabric.cli_formatting import setup_script_logging -from bfabric.experimental.app_interface.app_runner._spec import AppSpec -from bfabric.experimental.app_interface.app_runner.runner import run_app, Runner +from bfabric.experimental.entity_lookup_cache import EntityLookupCache app_chunk = cyclopts.App("chunk", help="Run an app on a chunk. You can create the chunks with `app dispatch`.") @@ -45,27 +46,6 @@ def run_all( ) -@app_chunk.command() -def inputs( - app_spec: Path, - chunk_dir: Path, - *, - ssh_user: str | None = None, -) -> None: - """Prepare the input files for a chunk. - - :param app_spec: Path to the app spec file. 
- :param chunk_dir: Path to the chunk directory. - :param ssh_user: SSH user to use for downloading the input files, instead of the current user. - """ - setup_script_logging() - client = Bfabric.from_config() - app_spec_parsed = AppSpec.model_validate(yaml.safe_load(app_spec.read_text())) - - runner = Runner(spec=app_spec_parsed, client=client, ssh_user=ssh_user) - runner.run_prepare_input(chunk_dir=chunk_dir) - - @app_chunk.command() def process(app_spec: Path, chunk_dir: Path) -> None: """Process a chunk. @@ -76,10 +56,12 @@ def process(app_spec: Path, chunk_dir: Path) -> None: """ setup_script_logging() client = Bfabric.from_config() + chunk_dir = chunk_dir.resolve() app_spec_parsed = AppSpec.model_validate(yaml.safe_load(app_spec.read_text())) - runner = Runner(spec=app_spec_parsed, client=client, ssh_user=None) - runner.run_process(chunk_dir=chunk_dir) + with EntityLookupCache.enable(): + runner = Runner(spec=app_spec_parsed, client=client, ssh_user=None) + runner.run_process(chunk_dir=chunk_dir) @app_chunk.command() @@ -103,6 +85,7 @@ def outputs( """ setup_script_logging() client = Bfabric.from_config() + chunk_dir = chunk_dir.resolve() app_spec_parsed = AppSpec.model_validate(yaml.safe_load(app_spec.read_text())) runner = Runner(spec=app_spec_parsed, client=client, ssh_user=ssh_user) diff --git a/app_runner/src/app_runner/cli/inputs.py b/app_runner/src/app_runner/cli/inputs.py new file mode 100644 index 00000000..e7518877 --- /dev/null +++ b/app_runner/src/app_runner/cli/inputs.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from pathlib import Path + +import cyclopts + +from app_runner.input_preparation import prepare_folder +from app_runner.input_preparation.integrity import IntegrityState +from app_runner.input_preparation.list_inputs import ( + list_input_states, + print_input_states, + FileState, +) +from app_runner.specs.inputs_spec import InputsSpec +from bfabric import Bfabric +from bfabric.cli_formatting import setup_script_logging + +app_inputs = cyclopts.App("inputs", help="Prepare input files for an app.") + + +@app_inputs.command() +def prepare( + inputs_yaml: Path, + target_folder: Path | None = None, + *, + ssh_user: str | None = None, +) -> None: + """Prepare the input files by downloading them (if necessary). + + :param inputs_yaml: Path to the inputs.yml file. + :param target_folder: Path to the target folder where the input files should be downloaded. + :param ssh_user: SSH user to use for downloading the input files, instead of the current user. + """ + setup_script_logging() + client = Bfabric.from_config() + prepare_folder( + inputs_yaml=inputs_yaml, + target_folder=target_folder, + ssh_user=ssh_user, + client=client, + action="prepare", + ) + + +@app_inputs.command() +def clean( + inputs_yaml: Path, + target_folder: Path | None = None, +) -> None: + """Removes all local copies of input files. + + :param inputs_yaml: Path to the inputs.yml file. + :param target_folder: Path to the target folder where the input files should be removed. 
+ """ + setup_script_logging() + client = Bfabric.from_config() + # TODO clean shouldn't even need all these arguments, this could be refactored later + prepare_folder( + inputs_yaml=inputs_yaml, + target_folder=target_folder, + ssh_user=None, + action="clean", + client=client, + ) + + +def get_inputs_and_print( + inputs_yaml: Path, + target_folder: Path | None, + check: bool, +) -> list[FileState]: + """Reads the input files, performing integrity checks if requested, and prints the results.""" + client = Bfabric.from_config() + input_states = list_input_states( + specs=InputsSpec.read_yaml(inputs_yaml), + target_folder=target_folder or Path(), + client=client, + check_files=check, + ) + print_input_states(input_states) + return input_states + + +@app_inputs.command(name="list") +def list_( + inputs_yaml: Path, + target_folder: Path | None = None, + check: bool = False, +) -> None: + """Lists the input files for an app. + + :param inputs_yaml: Path to the inputs.yml file. + :param target_folder: Path to the target folder where the input files should be located, if different from the + file containing the inputs.yml file. + """ + setup_script_logging() + get_inputs_and_print(inputs_yaml=inputs_yaml, target_folder=target_folder, check=check) + + +@app_inputs.command() +def check( + inputs_yaml: Path, + target_folder: Path | None = None, +) -> None: + """Checks if the input files are present and have the correct content. + + The script will exit with a non-zero status + code if any of the input files are missing or have incorrect content. + :param inputs_yaml: Path to the inputs.yml file. + :param target_folder: Path to the target folder where the input files should be located, if different from the + file containing the inputs.yml file. + """ + setup_script_logging() + input_states = get_inputs_and_print(inputs_yaml=inputs_yaml, target_folder=target_folder, check=True) + invalid_states = {state.integrity for state in input_states if state.integrity != IntegrityState.Correct} + if invalid_states: + print(f"Encountered invalid input states: {invalid_states}") + raise SystemExit(1) + else: + print("All input files are correct.") diff --git a/src/bfabric/experimental/app_interface/cli/outputs.py b/app_runner/src/app_runner/cli/outputs.py similarity index 80% rename from src/bfabric/experimental/app_interface/cli/outputs.py rename to app_runner/src/app_runner/cli/outputs.py index 2a656d54..5e9e42a3 100644 --- a/src/bfabric/experimental/app_interface/cli/outputs.py +++ b/app_runner/src/app_runner/cli/outputs.py @@ -4,11 +4,11 @@ import cyclopts +from app_runner.output_registration.register import register_all +from app_runner.specs.outputs_spec import OutputsSpec from bfabric import Bfabric from bfabric.cli_formatting import setup_script_logging from bfabric.entities import Workunit -from bfabric.experimental.app_interface.output_registration._spec import OutputsSpec -from bfabric.experimental.app_interface.output_registration.register import register_all app_outputs = cyclopts.App("outputs", help="Register output files for an app.") @@ -29,6 +29,10 @@ def register( specs_list = OutputsSpec.read_yaml(outputs_yaml) workunit = Workunit.find(id=workunit_id, client=client) + if workunit is None: + msg = f"Workunit with id {workunit_id} not found" + raise ValueError(msg) + register_all( client=client, workunit=workunit, diff --git a/src/bfabric/experimental/app_interface/cli/validate.py b/app_runner/src/app_runner/cli/validate.py similarity index 78% rename from 
src/bfabric/experimental/app_interface/cli/validate.py rename to app_runner/src/app_runner/cli/validate.py index ecb6e527..514bd2c1 100644 --- a/src/bfabric/experimental/app_interface/cli/validate.py +++ b/app_runner/src/app_runner/cli/validate.py @@ -7,9 +7,9 @@ import rich.pretty import yaml -from bfabric.experimental.app_interface.app_runner._spec import AppSpec -from bfabric.experimental.app_interface.input_preparation._spec import InputsSpec -from bfabric.experimental.app_interface.output_registration._spec import OutputsSpec +from app_runner.specs.app_spec import AppSpec +from app_runner.specs.inputs_spec import InputsSpec +from app_runner.specs.outputs_spec import OutputsSpec app_validate = cyclopts.App("validate", help="Validate yaml files.") diff --git a/src/bfabric/experimental/app_interface/util/__init__.py b/app_runner/src/app_runner/dispatch/__init__.py similarity index 100% rename from src/bfabric/experimental/app_interface/util/__init__.py rename to app_runner/src/app_runner/dispatch/__init__.py diff --git a/src/bfabric/experimental/app_interface/dispatch/dispatch_individual_resources.py b/app_runner/src/app_runner/dispatch/dispatch_individual_resources.py similarity index 62% rename from src/bfabric/experimental/app_interface/dispatch/dispatch_individual_resources.py rename to app_runner/src/app_runner/dispatch/dispatch_individual_resources.py index 21560c20..4e29f50c 100644 --- a/src/bfabric/experimental/app_interface/dispatch/dispatch_individual_resources.py +++ b/app_runner/src/app_runner/dispatch/dispatch_individual_resources.py @@ -1,15 +1,17 @@ from __future__ import annotations -from pathlib import Path -from typing import Any, Self +from typing import Any, TYPE_CHECKING -import yaml -from loguru import logger from pydantic import BaseModel, ConfigDict, model_validator -from bfabric import Bfabric +from app_runner.dispatch.generic import write_workunit_definition_file, write_chunks_file +from app_runner.dispatch.resource_flow import get_resource_flow_input_resources from bfabric.entities import Resource, Dataset -from bfabric.experimental.app_interface.workunit.definition import WorkunitDefinition + +if TYPE_CHECKING: + from pathlib import Path + from bfabric import Bfabric + from bfabric.experimental.workunit_definition import WorkunitDefinition class ConfigResourceFlow(BaseModel): @@ -28,13 +30,20 @@ class ConfigDispatchIndividualResources(BaseModel): dataset_flow: ConfigDatasetFlow | None @model_validator(mode="after") - def check_at_least_one_flow(self) -> Self: + def check_at_least_one_flow(self) -> ConfigDispatchIndividualResources: if self.resource_flow is None and self.dataset_flow is None: raise ValueError("either resource_flow or dataset_flow must be provided") return self def config_msi_imzml() -> ConfigDispatchIndividualResources: + """Returns the configuration for dispatching MSI imzML datasets to chunks. + + These apps allow both being run with a list of input `.imzML` resource files, or a dataset which contains a column + `Imzml` with the resource IDs and a column `PanelDataset` with the dataset IDs. + + Note: In the future the specifics of this might be adapted to allow e.g. `.imzML.7z` files or similar. 
+ """ return ConfigDispatchIndividualResources( resource_flow=ConfigResourceFlow(filter_suffix=".imzML"), dataset_flow=ConfigDatasetFlow(resource_column="Imzml", param_columns=[("PanelDataset", "mass_list_id")]), @@ -65,47 +74,32 @@ def dispatch_workunit(self, definition: WorkunitDefinition) -> None: paths = self._dispatch_jobs_dataset_flow(definition, params) else: raise ValueError("either dataset or resources must be provided") - self._write_workunit_definition(definition=definition) - self._write_chunks(chunks=paths) - - def _write_workunit_definition(self, definition: WorkunitDefinition) -> None: - self._out_dir.mkdir(exist_ok=True, parents=True) - with (self._out_dir / "workunit_definition.yml").open("w") as f: - yaml.safe_dump(definition.model_dump(mode="json"), f) - - def _write_chunks(self, chunks: list[Path]) -> None: - self._out_dir.mkdir(exist_ok=True, parents=True) - with (self._out_dir / "chunks.yml").open("w") as f: - data = {"chunks": [str(chunk) for chunk in chunks]} - yaml.safe_dump(data, f) + write_workunit_definition_file(out_dir=self._out_dir, definition=definition) + write_chunks_file(out_dir=self._out_dir, chunks=paths) def _dispatch_jobs_resource_flow(self, definition: WorkunitDefinition, params: dict[str, Any]) -> list[Path]: - if self._config.resource_flow is None: + """Returns the individual jobs for a resource flow workunit and returns the paths of the task folders.""" + config = self._config.resource_flow + if config is None: raise ValueError("resource_flow is not configured") - resources = Resource.find_all(ids=definition.execution.resources, client=self._client) - paths = [] - for resource in sorted(resources.values()): - if self._config.resource_flow.filter_suffix is not None and not resource["relativepath"].endswith( - self._config.resource_flow.filter_suffix - ): - logger.info( - f"Skipping resource {resource['relativepath']!r} as it does not match the extension filter." 
- ) - continue - paths.append(self.dispatch_job(resource=resource, params=params)) - return paths + resources = get_resource_flow_input_resources( + client=self._client, definition=definition, filter_suffix=config.filter_suffix + ) + return [self.dispatch_job(resource=resource, params=params) for resource in resources] def _dispatch_jobs_dataset_flow(self, definition: WorkunitDefinition, params: dict[str, Any]) -> list[Path]: - if self._config.dataset_flow is None: + config = self._config.dataset_flow + if config is None: raise ValueError("dataset_flow is not configured") dataset = Dataset.find(id=definition.execution.dataset, client=self._client) + if dataset is None: + msg = f"Dataset with id {definition.execution.dataset} not found" + raise ValueError(msg) dataset_df = dataset.to_polars() - resources = Resource.find_all( - ids=dataset_df[self._config.dataset_flow.resource_column].unique().to_list(), client=self._client - ) + resources = Resource.find_all(ids=dataset_df[config.resource_column].unique().to_list(), client=self._client) paths = [] for row in dataset_df.iter_rows(named=True): - resource_id = int(row[self._config.dataset_flow.resource_column]) - row_params = {name: row[dataset_name] for dataset_name, name in self._config.dataset_flow.param_columns} + resource_id = int(row[config.resource_column]) + row_params = {name: row[dataset_name] for dataset_name, name in config.param_columns} paths.append(self.dispatch_job(resource=resources[resource_id], params=params | row_params)) return paths diff --git a/app_runner/src/app_runner/dispatch/generic.py b/app_runner/src/app_runner/dispatch/generic.py new file mode 100644 index 00000000..b0e50f2b --- /dev/null +++ b/app_runner/src/app_runner/dispatch/generic.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import yaml + +if TYPE_CHECKING: + from pathlib import Path + from bfabric.experimental.workunit_definition import WorkunitDefinition + + +def write_workunit_definition_file(out_dir: Path, definition: WorkunitDefinition) -> None: + """Writes the workunit definition to the output directory's 'workunit_definition.yml'.""" + out_dir.mkdir(exist_ok=True, parents=True) + with (out_dir / "workunit_definition.yml").open("w") as f: + yaml.safe_dump(definition.model_dump(mode="json"), f) + + +def write_chunks_file(out_dir: Path, chunks: list[Path]) -> None: + """Writes the list of chunk paths to the output directory's 'chunks.yml'.""" + out_dir.mkdir(exist_ok=True, parents=True) + with (out_dir / "chunks.yml").open("w") as f: + data = {"chunks": [str(chunk) for chunk in chunks]} + yaml.safe_dump(data, f) diff --git a/app_runner/src/app_runner/dispatch/resource_flow.py b/app_runner/src/app_runner/dispatch/resource_flow.py new file mode 100644 index 00000000..a3d9f6a0 --- /dev/null +++ b/app_runner/src/app_runner/dispatch/resource_flow.py @@ -0,0 +1,21 @@ +from loguru import logger + +from bfabric import Bfabric +from bfabric.entities import Resource +from bfabric.experimental.workunit_definition import WorkunitDefinition + + +def get_resource_flow_input_resources( + client: Bfabric, + definition: WorkunitDefinition, + filter_suffix: str | None, +) -> list[Resource]: + """Returns the input resources for a resource flow workunit, applying e.g. 
a filter suffix.""" + all_resources = Resource.find_all(ids=definition.execution.resources, client=client) + result_resources = [] + for resource in sorted(all_resources.values()): + if filter_suffix is not None and not resource["relativepath"].endswith(filter_suffix): + logger.info(f"Skipping resource {resource['relativepath']!r} as it does not match the extension filter.") + continue + result_resources.append(resource) + return result_resources diff --git a/src/bfabric/experimental/app_interface/input_preparation/__init__.py b/app_runner/src/app_runner/input_preparation/__init__.py similarity index 100% rename from src/bfabric/experimental/app_interface/input_preparation/__init__.py rename to app_runner/src/app_runner/input_preparation/__init__.py diff --git a/app_runner/src/app_runner/input_preparation/integrity.py b/app_runner/src/app_runner/input_preparation/integrity.py new file mode 100644 index 00000000..b6bf9d33 --- /dev/null +++ b/app_runner/src/app_runner/input_preparation/integrity.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from enum import Enum + +from bfabric.entities import Resource, Dataset +from app_runner.specs.inputs_spec import InputSpecType, ResourceSpec, DatasetSpec +from app_runner.util.checksums import md5sum +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + from bfabric.bfabric import Bfabric + + +class IntegrityState(Enum): + """ + TODO basically this: enum(Missing, Exists(NOT_CHECKED, CORRECT, INCORRECT)) + """ + + Missing = "Missing" + NotChecked = "NotChecked" + Correct = "Correct" + Incorrect = "Incorrect" + + def exists(self) -> bool: + return self != IntegrityState.Missing + + +def check_integrity(spec: InputSpecType, local_path: Path, client: Bfabric) -> IntegrityState: + """Checks the integrity of a local file against the spec.""" + if not local_path.exists(): + return IntegrityState.Missing + + if isinstance(spec, ResourceSpec): + return _check_resource_spec(spec, local_path, client) + elif isinstance(spec, DatasetSpec): + return _check_dataset_spec(spec, local_path, client) + else: + raise ValueError(f"Unsupported spec type: {type(spec)}") + + +def _check_resource_spec(spec: ResourceSpec, local_path: Path, client: Bfabric) -> IntegrityState: + expected_checksum = Resource.find(id=spec.id, client=client)["filechecksum"] + if expected_checksum == md5sum(local_path): + return IntegrityState.Correct + else: + return IntegrityState.Incorrect + + +def _check_dataset_spec(spec: DatasetSpec, local_path: Path, client: Bfabric) -> IntegrityState: + dataset = Dataset.find(id=spec.id, client=client) + is_identical = local_path.read_text().strip() == dataset.get_csv(separator=spec.separator).strip() + return IntegrityState.Correct if is_identical else IntegrityState.Incorrect diff --git a/app_runner/src/app_runner/input_preparation/list_inputs.py b/app_runner/src/app_runner/input_preparation/list_inputs.py new file mode 100644 index 00000000..68cb81d1 --- /dev/null +++ b/app_runner/src/app_runner/input_preparation/list_inputs.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from rich.console import Console +from rich.table import Table, Column + +from app_runner.input_preparation.integrity import check_integrity, IntegrityState +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from app_runner.specs.inputs_spec import InputSpecType + from pathlib import Path + from bfabric.bfabric import Bfabric + + +@dataclass +class FileState: + name: str + path: Path + type: str 
+ exists: bool + integrity: IntegrityState + + +def list_input_states( + specs: list[InputSpecType], + target_folder: Path, + client: Bfabric, + check_files: bool, +) -> list[FileState]: + """Returns the states of the input files, performing integrity checks if requested.""" + input_states = [] + for spec in specs: + filename = spec.resolve_filename(client=client) + path = target_folder / filename + exists = path.exists() + if not check_files: + integrity = IntegrityState.NotChecked + else: + integrity = check_integrity(spec=spec, local_path=path, client=client) + input_states.append(FileState(name=filename, path=path, exists=exists, integrity=integrity, type=spec.type)) + return input_states + + +def print_input_states(input_states: list[FileState]) -> None: + """Prints the states of the input files to the command line.""" + table = Table( + Column("File"), + Column("Input Type"), + Column("Exists"), + Column("Integrity"), + ) + for state in input_states: + table.add_row( + str(state.name), + str(state.type), + {True: "Yes", False: "No"}[state.exists], + state.integrity.value, + ) + console = Console() + console.print(table) diff --git a/app_runner/src/app_runner/input_preparation/prepare.py b/app_runner/src/app_runner/input_preparation/prepare.py new file mode 100644 index 00000000..a67147d5 --- /dev/null +++ b/app_runner/src/app_runner/input_preparation/prepare.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +from bfabric.entities import Resource, Dataset +from loguru import logger + +from app_runner.input_preparation.integrity import IntegrityState +from app_runner.input_preparation.list_inputs import list_input_states +from app_runner.specs.inputs_spec import ( + ResourceSpec, + DatasetSpec, + InputSpecType, + InputsSpec, +) +from app_runner.util.checksums import md5sum +from app_runner.util.scp import scp + +if TYPE_CHECKING: + from pathlib import Path + from bfabric.bfabric import Bfabric + + +class PrepareInputs: + def __init__(self, client: Bfabric, working_dir: Path, ssh_user: str | None) -> None: + self._client = client + self._working_dir = working_dir + self._ssh_user = ssh_user + + def prepare_all(self, specs: list[InputSpecType]) -> None: + # TODO ensure dataset is cached + input_states = list_input_states( + specs=specs, target_folder=self._working_dir, client=self._client, check_files=True + ) + for spec, input_state in zip(specs, input_states): + if input_state.integrity == IntegrityState.Correct: + logger.debug(f"Skipping {spec} as it already exists and passed integrity check") + elif isinstance(spec, ResourceSpec): + self.prepare_resource(spec) + elif isinstance(spec, DatasetSpec): + self.prepare_dataset(spec) + else: + raise ValueError(f"Unsupported spec type: {type(spec)}") + + def clean_all(self, specs: list[InputSpecType]) -> None: + input_states = list_input_states( + specs=specs, target_folder=self._working_dir, client=self._client, check_files=False + ) + for spec, input_state in zip(specs, input_states): + if not input_state.exists: + logger.debug(f"Skipping {spec} as it does not exist") + else: + logger.info(f"rm {input_state.path}") + input_state.path.unlink() + + def prepare_resource(self, spec: ResourceSpec) -> None: + resource = Resource.find(id=spec.id, client=self._client) + if resource is None: + msg = f"Resource with id {spec.id} not found" + raise ValueError(msg) + + # determine path to copy from + # TODO as we have seen sometimes a faster approach would be to copy from the NFS mount, but this 
needs to be + # configured or recognized somehow + scp_uri = f"{resource.storage.scp_prefix}{resource['relativepath']}" + + # determine path to copy to + result_name = spec.filename if spec.filename else resource["name"] + result_path = self._working_dir / result_name + + # perform the copy + scp(scp_uri, str(result_path), user=self._ssh_user) + + # verify checksum + if spec.check_checksum: + actual_checksum = md5sum(result_path) + logger.debug(f"Checksum: expected {resource['filechecksum']}, got {actual_checksum}") + if actual_checksum != resource["filechecksum"]: + raise ValueError(f"Checksum mismatch: expected {resource['filechecksum']}, got {actual_checksum}") + + def prepare_dataset(self, spec: DatasetSpec) -> None: + dataset = Dataset.find(id=spec.id, client=self._client) + # TODO use the new functionality Dataset.get_csv (or even go further in the refactoring) + target_path = self._working_dir / spec.filename + target_path.parent.mkdir(exist_ok=True, parents=True) + dataset.write_csv(path=target_path, separator=spec.separator) + + def clean_resource(self, spec: ResourceSpec) -> None: + filename = spec.resolve_filename(client=self._client) + path = self._working_dir / filename + if path.exists(): + logger.info(f"Removing {path}") + path.unlink() + else: + logger.debug(f"Resource {path} does not exist") + + def clean_dataset(self, spec: DatasetSpec) -> None: + path = self._working_dir / spec.filename + if path.exists(): + logger.info(f"Removing {path}") + path.unlink() + else: + logger.debug(f"Dataset {path} does not exist") + + +def prepare_folder( + inputs_yaml: Path, + target_folder: Path | None, + client: Bfabric, + ssh_user: str | None, + action: Literal["prepare", "clean"] = "prepare", +) -> None: + """Prepares the input files of a chunk folder according to the provided specs. + + :param inputs_yaml: Path to the inputs.yml file. + :param target_folder: Path to the target folder where the input files should be downloaded. + :param client: Bfabric client to use for obtaining metadata about the input files. + :param ssh_user: SSH user to use for downloading the input files, should it be different from the current user. + :param action: Action to perform. 
+ """ + # set defaults + inputs_yaml = inputs_yaml.absolute() + if target_folder is None: + target_folder = inputs_yaml.parent + + # parse the specs + specs_list = InputsSpec.read_yaml(inputs_yaml) + + # prepare the folder + prepare = PrepareInputs(client=client, working_dir=target_folder, ssh_user=ssh_user) + if action == "prepare": + prepare.prepare_all(specs=specs_list) + elif action == "clean": + prepare.clean_all(specs=specs_list) + else: + raise ValueError(f"Unknown action: {action}") diff --git a/src/bfabric/experimental/app_interface/output_registration/__init__.py b/app_runner/src/app_runner/output_registration/__init__.py similarity index 56% rename from src/bfabric/experimental/app_interface/output_registration/__init__.py rename to app_runner/src/app_runner/output_registration/__init__.py index 3cbba34e..1962bf10 100644 --- a/src/bfabric/experimental/app_interface/output_registration/__init__.py +++ b/app_runner/src/app_runner/output_registration/__init__.py @@ -1,3 +1,3 @@ from .register import register_outputs -__ALL__ = ["register_outputs"] +__all__ = ["register_outputs"] diff --git a/src/bfabric/experimental/app_interface/output_registration/register.py b/app_runner/src/app_runner/output_registration/register.py similarity index 76% rename from src/bfabric/experimental/app_interface/output_registration/register.py rename to app_runner/src/app_runner/output_registration/register.py index 8816dff2..a14a8bb2 100644 --- a/src/bfabric/experimental/app_interface/output_registration/register.py +++ b/app_runner/src/app_runner/output_registration/register.py @@ -1,21 +1,25 @@ from __future__ import annotations -from pathlib import Path from loguru import logger -from bfabric import Bfabric from bfabric.entities import Storage, Workunit -from bfabric.experimental.app_interface.output_registration._spec import ( +from app_runner.specs.outputs_spec import ( CopyResourceSpec, UpdateExisting, OutputsSpec, SpecType, SaveDatasetSpec, ) -from bfabric.experimental.app_interface.util.checksums import md5sum -from bfabric.experimental.app_interface.util.scp import scp -from bfabric.scripts.bfabric_save_csv2dataset import bfabric_save_csv2dataset +from app_runner.util.checksums import md5sum +from app_runner.util.scp import scp +from bfabric_scripts.bfabric_save_csv2dataset import bfabric_save_csv2dataset +from glom import glom +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + from bfabric import Bfabric def _get_output_folder(spec: CopyResourceSpec, workunit: Workunit) -> Path: @@ -31,7 +35,8 @@ def register_file_in_workunit( workunit: Workunit, storage: Storage, resource_id: int | None = None, -): +) -> None: + """Registers a file in the workunit.""" if spec.update_existing != UpdateExisting.NO: # TODO implement this functionality raise NotImplementedError("Update existing not implemented") @@ -51,14 +56,18 @@ def register_file_in_workunit( client.save("resource", resource_data) -def copy_file_to_storage(spec: CopyResourceSpec, workunit: Workunit, storage: Storage, ssh_user: str | None): +def copy_file_to_storage(spec: CopyResourceSpec, workunit: Workunit, storage: Storage, ssh_user: str | None) -> None: + """Copies a file to the storage, according to the spec.""" output_folder = _get_output_folder(spec, workunit=workunit) output_uri = f"{storage.scp_prefix}{output_folder / spec.store_entry_path}" scp(spec.local_path, output_uri, user=ssh_user) -def _save_dataset(spec: SaveDatasetSpec, client: Bfabric, workunit: Workunit): +def _save_dataset(spec: 
SaveDatasetSpec, client: Bfabric, workunit: Workunit) -> None: + """Saves a dataset to the bfabric.""" # TODO should not print to stdout in the future + # TODO also it should not be imported from bfabric_scripts, but rather the generic functionality should be available + # in the main package bfabric_save_csv2dataset( client=client, csv_file=spec.local_path, @@ -72,6 +81,7 @@ def _save_dataset(spec: SaveDatasetSpec, client: Bfabric, workunit: Workunit): def find_default_resource_id(workunit: Workunit) -> int | None: + """Finds the default resource's id for the workunit. Maybe in the future, this will be always `None`.""" candidate_resources = [ resource for resource in workunit.resources if resource["name"] not in ["slurm_stdout", "slurm_stderr"] ] @@ -83,12 +93,13 @@ def find_default_resource_id(workunit: Workunit) -> int | None: def register_all( client: Bfabric, workunit: Workunit, specs_list: list[SpecType], ssh_user: str | None, reuse_default_resource: bool -): +) -> None: + """Registers all the output specs to the workunit.""" default_resource_was_reused = not reuse_default_resource for spec in specs_list: logger.debug(f"Registering {spec}") if isinstance(spec, CopyResourceSpec): - storage = workunit.application.storage + storage = glom(workunit, "application.storage") copy_file_to_storage(spec, workunit=workunit, storage=storage, ssh_user=ssh_user) if not default_resource_was_reused: resource_id = find_default_resource_id(workunit=workunit) @@ -109,11 +120,16 @@ def register_outputs( ssh_user: str | None, reuse_default_resource: bool, ) -> None: + """Registers outputs to the workunit.""" + # TODO it seems there is some redundancy here (i.e. there is also the implementation in runner) # parse the specs specs_list = OutputsSpec.read_yaml(outputs_yaml) # register all specs workunit = Workunit.find(id=workunit_id, client=client) + if workunit is None: + msg = f"Workunit with id {workunit_id} not found" + raise ValueError(msg) register_all( client=client, workunit=workunit, diff --git a/src/bfabric/experimental/app_interface/workunit/__init__.py b/app_runner/src/app_runner/py.typed similarity index 100% rename from src/bfabric/experimental/app_interface/workunit/__init__.py rename to app_runner/src/app_runner/py.typed diff --git a/app_runner/src/app_runner/specs/__init__.py b/app_runner/src/app_runner/specs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/bfabric/experimental/app_interface/app_runner/_spec.py b/app_runner/src/app_runner/specs/app_spec.py similarity index 55% rename from src/bfabric/experimental/app_interface/app_runner/_spec.py rename to app_runner/src/app_runner/specs/app_spec.py index e8312678..08bc56c7 100644 --- a/src/bfabric/experimental/app_interface/app_runner/_spec.py +++ b/app_runner/src/app_runner/specs/app_spec.py @@ -3,15 +3,17 @@ import os import shlex from pathlib import Path -from typing import Literal, Annotated, Union +from typing import Literal, Annotated -from pydantic import BaseModel, Discriminator +from pydantic import BaseModel, Discriminator, ConfigDict # TODO: This is kept very simple for now, so that it could be easily extended in the future. 
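For orientation, here is a minimal sketch (not part of the diff) of how the command spec models defined just below could be instantiated, assuming `AppSpec` is importable from `app_runner.specs.app_spec`; the image name, commands, and environment values are made-up placeholders.

```python
from app_runner.specs.app_spec import AppSpec

# Hypothetical spec; field names follow the pydantic models defined below.
spec = AppSpec.model_validate(
    {
        "commands": {
            "dispatch": {"type": "shell", "command": "python dispatch.py"},
            "process": {
                "type": "docker",
                "image": "example/app:latest",             # placeholder image
                "command": "python process.py chunk.yml",  # placeholder command
                "env": {"N_THREADS": "4"},                 # forwarded as --env N_THREADS=4
            },
            "collect": {"type": "shell", "command": "python collect.py"},
        }
    }
)

# CommandDocker.to_shell() assembles the full `docker run` argument list, including mounts.
print(spec.commands.process.to_shell())
```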
class CommandShell(BaseModel): + model_config = ConfigDict(extra="forbid") + type: Literal["shell"] = "shell" command: str @@ -20,30 +22,44 @@ def to_shell(self) -> list[str]: class MountOptions(BaseModel): - work_dir_target: Path = "/work" + model_config = ConfigDict(extra="forbid") + + work_dir_target: Path | None = None read_only: list[tuple[Path, Path]] = [] + writeable: list[tuple[Path, Path]] = [] share_bfabric_config: bool = True - def collect(self, work_dir: Path): + def collect(self, work_dir: Path) -> list[tuple[Path, Path, bool]]: mounts = [] if self.share_bfabric_config: mounts.append((Path("~/.bfabricpy.yml"), Path("/home/user/.bfabricpy.yml"), True)) - mounts.append((work_dir, self.work_dir_target, False)) + # TODO reconsider if we ever want work_dir_target to be customizable to be different from host path + # (currently things will break down if this is configured) + work_dir_target = work_dir if self.work_dir_target is None else self.work_dir_target + mounts.append((work_dir, work_dir_target, False)) for source, target in self.read_only: mounts.append((source, target, True)) + for source, target in self.writeable: + mounts.append((source, target, False)) return [(source.expanduser().absolute(), target, read_only) for source, target, read_only in mounts] class CommandDocker(BaseModel): + model_config = ConfigDict(extra="forbid") + # TODO not sure if to call this "docker", since "docker-compatible" would be appropriate type: Literal["docker"] = "docker" image: str command: str + entrypoint: str | None = None engine: str = "docker" + env: dict[str, str] = {} + mac_address: str | None = None mounts: MountOptions = MountOptions() + custom_args: list[str] = [] def to_shell(self, work_dir: Path | None = None) -> list[str]: - work_dir = (work_dir or Path(".")).expanduser().absolute() + work_dir = (work_dir or Path()).expanduser().absolute() mounts = self.mounts.collect(work_dir=work_dir) mount_args = [] for host, container, read_only in mounts: @@ -51,6 +67,13 @@ def to_shell(self, work_dir: Path | None = None) -> list[str]: target = shlex.quote(str(container)) mount_args.append("--mount") mount_args.append(f"type=bind,source={source},target={target}" + (",readonly" if read_only else "")) + entrypoint_arg = ["--entrypoint", self.entrypoint] if self.entrypoint else [] + env_args = [] + for key, value in self.env.items(): + env_args.append("--env") + env_args.append(f"{key}={shlex.quote(value)}") + mac_address_arg = ["--mac-address", self.mac_address] if self.mac_address else [] + return [ self.engine, "run", @@ -58,21 +81,29 @@ def to_shell(self, work_dir: Path | None = None) -> list[str]: f"{os.getuid()}:{os.getgid()}", "--rm", *mount_args, + *entrypoint_arg, + *env_args, + *mac_address_arg, + *self.custom_args, self.image, *shlex.split(self.command), ] -Command = Annotated[Union[CommandShell, CommandDocker], Discriminator("type")] +Command = Annotated[CommandShell | CommandDocker, Discriminator("type")] class CommandsSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + dispatch: Command process: Command collect: Command class AppSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + commands: CommandsSpec # Note: While we use the old submitter, this is still necessary reuse_default_resource: bool = True diff --git a/src/bfabric/experimental/app_interface/input_preparation/_spec.py b/app_runner/src/app_runner/specs/inputs_spec.py similarity index 60% rename from src/bfabric/experimental/app_interface/input_preparation/_spec.py rename to 
app_runner/src/app_runner/specs/inputs_spec.py index 0a546648..d5e1c0f4 100644 --- a/src/bfabric/experimental/app_interface/input_preparation/_spec.py +++ b/app_runner/src/app_runner/specs/inputs_spec.py @@ -1,22 +1,39 @@ from __future__ import annotations -from pathlib import Path -from typing import Annotated, Literal, Union +from typing import Annotated, Literal, TYPE_CHECKING import yaml from pydantic import BaseModel, ConfigDict, Field, Discriminator +from bfabric.entities import Resource + # ":" are not allowed, as well as absolute paths (starting with "/") RelativeFilePath = Annotated[str, Field(pattern=r"^[^/][^:]*$")] +if TYPE_CHECKING: + from pathlib import Path + from bfabric.bfabric import Bfabric + class ResourceSpec(BaseModel): model_config = ConfigDict(extra="forbid") type: Literal["bfabric_resource"] = "bfabric_resource" id: int + """B-Fabric resource ID""" + filename: RelativeFilePath | None = None + """Target filename to save to""" + check_checksum: bool = True + """Whether to check the checksum of the file, after downloading""" + + def resolve_filename(self, client: Bfabric) -> str: + if self.filename: + return self.filename + else: + resource = Resource.find(id=self.id, client=client) + return resource["name"] class DatasetSpec(BaseModel): @@ -24,13 +41,22 @@ class DatasetSpec(BaseModel): type: Literal["bfabric_dataset"] = "bfabric_dataset" id: int + """B-Fabric dataset ID""" + filename: RelativeFilePath + """Target filename to save to""" + separator: Literal[",", "\t"] = "," + """Separator for the CSV file""" + # has_header: bool # invalid_characters: str = "" + def resolve_filename(self, client: Bfabric) -> str: + return self.filename + -InputSpecType = Annotated[Union[ResourceSpec, DatasetSpec], Discriminator("type")] +InputSpecType = Annotated[ResourceSpec | DatasetSpec, Discriminator("type")] class InputsSpec(BaseModel): diff --git a/src/bfabric/experimental/app_interface/output_registration/_spec.py b/app_runner/src/app_runner/specs/outputs_spec.py similarity index 85% rename from src/bfabric/experimental/app_interface/output_registration/_spec.py rename to app_runner/src/app_runner/specs/outputs_spec.py index bffb1f36..8660e6fa 100644 --- a/src/bfabric/experimental/app_interface/output_registration/_spec.py +++ b/app_runner/src/app_runner/specs/outputs_spec.py @@ -1,8 +1,8 @@ from __future__ import annotations import enum -from pathlib import Path -from typing import Literal, Union, Annotated +from pathlib import Path # noqa: TCH003 +from typing import Literal, Annotated import yaml from pydantic import BaseModel, ConfigDict, Field @@ -33,14 +33,16 @@ class CopyResourceSpec(BaseModel): class SaveDatasetSpec(BaseModel): model_config = ConfigDict(extra="forbid") - type: Literal["bfabric_save_dataset"] = "bfabric_dataset" + type: Literal["bfabric_dataset"] = "bfabric_dataset" local_path: Path separator: str name: str | None = None + has_header: bool = True + invalid_characters: str = "" -SpecType = Union[CopyResourceSpec, SaveDatasetSpec] +SpecType = CopyResourceSpec | SaveDatasetSpec class OutputsSpec(BaseModel): diff --git a/app_runner/src/app_runner/util/__init__.py b/app_runner/src/app_runner/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/bfabric/experimental/app_interface/util/checksums.py b/app_runner/src/app_runner/util/checksums.py similarity index 79% rename from src/bfabric/experimental/app_interface/util/checksums.py rename to app_runner/src/app_runner/util/checksums.py index e830f41f..e683af0a 100644 --- 
a/src/bfabric/experimental/app_interface/util/checksums.py +++ b/app_runner/src/app_runner/util/checksums.py @@ -1,7 +1,10 @@ from __future__ import annotations import hashlib -from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path def md5sum(file: Path) -> str: diff --git a/src/bfabric/experimental/app_interface/util/scp.py b/app_runner/src/app_runner/util/scp.py similarity index 97% rename from src/bfabric/experimental/app_interface/util/scp.py rename to app_runner/src/app_runner/util/scp.py index 34624239..117d8a5d 100644 --- a/src/bfabric/experimental/app_interface/util/scp.py +++ b/app_runner/src/app_runner/util/scp.py @@ -31,7 +31,7 @@ def scp(source: str | Path, target: str | Path, *, user: str | None = None, mkdi if mkdir: if target_remote: host, path = target.split(":", 1) - parent_path = str(Path(path).parent) + parent_path = Path(path).parent logger.debug(f"ssh {host} mkdir -p {parent_path}") subprocess.run(["ssh", host, "mkdir", "-p", parent_path], check=True) else: diff --git a/docs/changelog.md b/docs/changelog.md index 580e3aad..e08d67ad 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -10,6 +10,32 @@ Versioning currently follows `X.Y.Z` where ## \[Unreleased\] +## \[1.13.9\] - 2024-12-10 + +From this release onwards, the experimental app runner is not part of the main bfabric package and +instead a separate Python package with its individual changelog. + +### Added + +- Relationship: `ExternalJob.executable` +- (experimental) EntityLookupCache that allows to cache entity lookups in a script to avoid redundant requests. +- Specific use case script: bfabric_save_resource_description.py (the functionality will be available in a future CLI). + +### Fixed + +- `Entity.find_all` returns no values when an empty list is passed as an argument. + +### Changed + +- Except for macOS x86_64 (which we assume is Rosetta emulation nowadays), we use the faster `polars` instead of `polars-lts-cpu`. +- `BfabricRequestError` is now a `RuntimeError` subclass. +- Add `py.typed` marker. + +### Removed + +- `bfabric_legacy.py` has been removed. +- `math_helper.py` has been removed. + ## \[1.13.8\] - 2024-10-03 This release contains mainly internal changes and ongoing development on the experimental app interface functionality. diff --git a/docs/contribute.md b/docs/contribute.md index 5cdce50a..5ebf9045 100644 --- a/docs/contribute.md +++ b/docs/contribute.md @@ -33,11 +33,10 @@ mkdocs gh-deploy ## Release To create a release, increase the version in `pyproject.toml`, prepare `changelog.md`, -commit everything and run `release.py`. This script will +commit everything and create a PR to the `stable` branch. -- Create a tag and push it to GitHub -- Merge the changes to the `stable` branch and push it to GitHub -- Build the documentation and push it to GitHub Pages +Once this is merged a Github Action will create a tag (if the tag already exists, it will fail!) and the documentation +will be rebuilt and published to GitHub Pages. The only manual step that remains is creating a release on GitHub. To do so, you can paste the changelog section of the release and create a new release on GitHub using the tag that was created. 
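To illustrate the `ExternalJob.executable` relationship mentioned in the changelog above, a minimal sketch (not part of the diff); the ID is a placeholder and the `name` field access is an assumption about the executable record.

```python
from bfabric import Bfabric
from bfabric.entities import ExternalJob

client = Bfabric.from_config()
external_job = ExternalJob.find(id=12345, client=client)  # placeholder ID
if external_job is not None and external_job.executable is not None:
    # assumes the linked executable carries a "name" attribute
    print(external_job.executable["name"])
```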
diff --git a/docs/index.md b/docs/index.md index 92983fc7..ac58235e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,8 +1,13 @@ # Home -This package connects the [bfabric](https://fgcz-bfabric.uzh.ch/bfabric/) system to the [python](https://www.python.org/) and [R](https://cran.r-project.org/) world while providing a JSON and REST interface using [Flask](https://www.fullstackpython.com). -The [bfabricShiny](https://github.com/cpanse/bfabricShiny) R package is an extension and provides code snippets and sample implementation for a seamless R shiny bfabric integration. -For more advanced users the *bfabricPy* package also provides a powerful query interface on the command-line though using the provided scripts. +This package implements a Python interface to the [B-Fabric](https://fgcz-bfabric.uzh.ch/bfabric/) system. +Several pieces of functionality are available: + +- Python API: + - General client for all B-Fabric web service operations (CRUD) and configuration management. + - A relational API for low-boilerplate read access to the B-Fabric system. +- Scripts: Several scripts we use more or less frequently to interact with the system. +- A REST API: A REST API to interact with the B-Fabric system. This allows us to interact with B-Fabric from R using [bfabricShiny](https://github.com/cpanse/bfabricShiny). Please see below for how to install bfabricPy. diff --git a/docs/old/cheatsheet.md b/docs/old/cheatsheet.md new file mode 100644 index 00000000..9bd33e47 --- /dev/null +++ b/docs/old/cheatsheet.md @@ -0,0 +1,255 @@ +The contents of this page are not up-to-date and will be integrated into the documentation or removed. + +### Read + +```{bash} +bfabric_read.py storage +bfabric_read.py application +``` + +Simple database query examples + +```{bash} +bfabric_read.py user login cpanse +bfabric_read.py project id 3000 +bfabric_read.py workunit id 199387 +bfabric_read.py sample name autoQC4L +bfabric_read.py workunit status processing +bfabric_read.py workunit status pending +bfabric_read.py workunit status failed + +# list empty resources +bfabric_read.py resource filechecksum d41d8cd98f00b204e9800998ecf8427e +``` + +Using the Python API: + +```{py} +from bfabric import Bfabric + +client = Bfabric.from_config() + +user = client.read(endpoint = 'user', obj={'login': 'cpanse'}) +resource = client.read(endpoint = 'resource', obj={'id': 550327 }) +``` + +### save + +```{bash} +bfabric_save_workunit_attribute.py 199387 status available +``` + +```{python} +import json +rv = client.save('workunit', {'id': 254063, 'status': 'available'}) +print(json.dumps(rv.to_list_dict(), indent=2)) +``` + +### Command line code snippet + +Find empty resource files in bfabric + +```{bash} +bfabric_read.py resource filechecksum `md5sum < /dev/null | cut -c-32` \ + | cat -n \ + | tail +``` + +## Examples \[outdated\] + +### bash script generated by the yaml wrapper creator / submitter + +externaljobid-45939_executableid-15312.bash listing: + +```bash +#!/bin/bash +# +# $HeadURL: http://fgcz-svn.uzh.ch/repos/scripts/trunk/linux/bfabric/apps/python/README.md $ +# $Id: README.md 2535 2016-10-24 08:49:17Z cpanse $ +# Christian Panse 2007-2015 + +# Grid Engine Parameters +#$ -q PRX@fgcz-c-071 +#$ -e /home/bfabric/sgeworker/logs/workunitid-134923_resourceid-203236.err +#$ -o /home/bfabric/sgeworker/logs/workunitid-134923_resourceid-203236.out + + +set -e +set -o pipefail + +export EXTERNALJOBID=45938 +export RESSOURCEID_OUTPUT=203238 +export RESSOURCEID_STDOUT_STDERR="203237 203238" +export 
OUTPUT="bfabric@fgczdata.fgcz-net.unizh.ch:/srv/www/htdocs//p1000/bfabric/Proteomics/gerneric_yaml/2015/2015-09/2015-09-02//workunit_134923//203236.zip" + +# job configuration set by B-Fabrics wrapper_creator executable +_OUTPUT=`echo $OUTPUT | cut -d"," -f1` +test $? -eq 0 && _OUTPUTHOST=`echo $_OUTPUT | cut -d":" -f1` +test $? -eq 0 && _OUTPUTPATH=`echo $_OUTPUT | cut -d":" -f2` +test $? -eq 0 && _OUTPUTPATH=`dirname $_OUTPUTPATH` +test $? -eq 0 && ssh $_OUTPUTHOST "mkdir -p $_OUTPUTPATH" + +if [ $? -eq 1 ]; +then + echo "writting to output url failed!"; + exit 1; +fi + +cat > /tmp/yaml_config.$$ < + + + + + XXX + XXX + + 482 + + + + + ' +} + +for url in https://fgcz-bfabric.uzh.ch/bfabric/user?wsdl https://fgcz-bfabric-test.uzh.ch/bfabric/user?wsdl; +do + echo + echo "==== ${url} === " + query ${url} +done + +echo $? +``` + +### Example usage + +remove accidentally inserted mgf files + +``` +bfabric_read.py importresource \ + | grep mgf$ \ + | awk '{print $1}' \ + | tee /tmp/$$.log \ + | while read i; + do + bfabric_delete.py importresource $i ; + done +``` + +## Send an E-mail \[outdated\] + +``` +# by CT,CP +# not implemented yet 2022-10-19 , +rv = B.save_object(endpoint = 'mail', + obj={'subject': "TEST", + 'recipientemail': 'bfabrictest482.cp@fgcz.ethz.ch', + 'message': "TEST; ignore that email", + 'parentId': 482, + 'parentClassName': 'user'}) +# shown as mail for user id 482 +``` + +## See also + +- [bfabric documentation](https://fgcz-bfabric.uzh.ch/wiki/HomePage) +- [FAQ](faq.md) +- [wsdl4BFabric](http://fgcz-intranet.uzh.ch/tiki-index.php?page=wsdl4BFabric) wiki page +- WSDL Interface to B-Fabric [endpoints](http://fgcz-bfabric.uzh.ch/bfabric/workunit?wsdl) + +## FAQ + +### How to resolve `= 3.0.3", "rich >= 13.7.1", "zeep >= 4.2.1", - "polars-lts-cpu >= 0.20.25", + "polars-lts-cpu >= 0.20.25; platform_machine == 'x86_64' and platform_system == 'Darwin'", + "polars >= 0.20.25; platform_machine != 'x86_64' or platform_system != 'Darwin'", "loguru>=0.7", - "setuptools", - "pydantic", + "pydantic>=2.9.2", "eval_type_backport; python_version < '3.10'", "python-dateutil >= 2.9.0", - "cyclopts", + "cyclopts >= 2.9.9", #"platformdirs >= 4.3", ] [project.optional-dependencies] dev = [ + "bfabric[doc,test]", "black", "isort", "ruff", "licensecheck", - "pytest", - "pytest-mock", - "logot", "nox", "uv", - "mkdocs", - "mkdocs-material", - "mkdocstrings[python]", ] +doc = ["mkdocs", "mkdocs-material", "mkdocstrings[python]"] +test = ["pytest", "pytest-mock", "logot"] +typing = ["mypy", "types-requests", "lxml-stubs", "pandas-stubs", "types-python-dateutil"] [project.urls] Homepage = "https://github.com/fgcz/bfabricPy" Repository = "https://github.com/fgcz/bfabricPy" [project.scripts] -"bfabric_flask.py"="bfabric_scripts.bfabric_flask:main" +"bfabric_flask.py" = "bfabric_scripts.bfabric_flask:main" #bfabric_feeder_resource_autoQC="bfabric_scripts.bfabric_feeder_resource_autoQC:main" -"bfabric_list_not_existing_storage_directories.py"="bfabric_scripts.bfabric_list_not_existing_storage_directories:main" -"bfabric_list_not_available_proteomics_workunits.py"="bfabric_scripts.bfabric_list_not_available_proteomics_workunits:main" -"bfabric_list_workunit_parameters.py"="bfabric_scripts.bfabric_list_workunit_parameters:main" -"bfabric_upload_resource.py"="bfabric_scripts.bfabric_upload_resource:main" -"bfabric_logthis.py"="bfabric_scripts.bfabric_logthis:main" -"bfabric_setResourceStatus_available.py"="bfabric_scripts.bfabric_setResourceStatus_available:main" 
-"bfabric_setExternalJobStatus_done.py"="bfabric_scripts.bfabric_setExternalJobStatus_done:main" -"bfabric_setWorkunitStatus_available.py"="bfabric_scripts.bfabric_setWorkunitStatus:main_available" -"bfabric_setWorkunitStatus_processing.py"="bfabric_scripts.bfabric_setWorkunitStatus:main_processing" -"bfabric_setWorkunitStatus_failed.py"="bfabric_scripts.bfabric_setWorkunitStatus:main_failed" -"bfabric_delete.py"="bfabric_scripts.bfabric_delete:main" -"bfabric_read.py"="bfabric_scripts.bfabric_read:main" -"bfabric_read_samples_of_workunit.py"="bfabric_scripts.bfabric_read_samples_of_workunit:main" -"bfabric_read_samples_from_dataset.py"="bfabric_scripts.bfabric_read_samples_from_dataset:main" -"bfabric_save_csv2dataset.py"="bfabric_scripts.bfabric_save_csv2dataset:main" -"bfabric_save_dataset2csv.py"="bfabric_scripts.bfabric_save_dataset2csv:main" -"bfabric_save_fasta.py"="bfabric_scripts.bfabric_save_fasta:main" -"bfabric_save_importresource_sample.py"="bfabric_scripts.bfabric_save_importresource_sample:main" -"bfabric_save_link_to_workunit.py"="bfabric_scripts.bfabric_save_link_to_workunit:main" +"bfabric_list_not_existing_storage_directories.py" = "bfabric_scripts.bfabric_list_not_existing_storage_directories:main" +"bfabric_list_not_available_proteomics_workunits.py" = "bfabric_scripts.bfabric_list_not_available_proteomics_workunits:main" +"bfabric_list_workunit_parameters.py" = "bfabric_scripts.bfabric_list_workunit_parameters:main" +"bfabric_upload_resource.py" = "bfabric_scripts.bfabric_upload_resource:main" +"bfabric_logthis.py" = "bfabric_scripts.bfabric_logthis:main" +"bfabric_setResourceStatus_available.py" = "bfabric_scripts.bfabric_setResourceStatus_available:main" +"bfabric_setExternalJobStatus_done.py" = "bfabric_scripts.bfabric_setExternalJobStatus_done:main" +"bfabric_setWorkunitStatus_available.py" = "bfabric_scripts.bfabric_setWorkunitStatus:main_available" +"bfabric_setWorkunitStatus_processing.py" = "bfabric_scripts.bfabric_setWorkunitStatus:main_processing" +"bfabric_setWorkunitStatus_failed.py" = "bfabric_scripts.bfabric_setWorkunitStatus:main_failed" +"bfabric_delete.py" = "bfabric_scripts.bfabric_delete:main" +"bfabric_read.py" = "bfabric_scripts.bfabric_read:main" +"bfabric_read_samples_of_workunit.py" = "bfabric_scripts.bfabric_read_samples_of_workunit:main" +"bfabric_read_samples_from_dataset.py" = "bfabric_scripts.bfabric_read_samples_from_dataset:main" +"bfabric_save_csv2dataset.py" = "bfabric_scripts.bfabric_save_csv2dataset:main" +"bfabric_save_dataset2csv.py" = "bfabric_scripts.bfabric_save_dataset2csv:main" +"bfabric_save_fasta.py" = "bfabric_scripts.bfabric_save_fasta:main" +"bfabric_save_importresource_sample.py" = "bfabric_scripts.bfabric_save_importresource_sample:main" +"bfabric_save_link_to_workunit.py" = "bfabric_scripts.bfabric_save_link_to_workunit:main" #bfabric_save_resource="bfabric_scripts.bfabric_save_resource:main" -"bfabric_save_workunit_attribute.py"="bfabric_scripts.bfabric_save_workunit_attribute:main" -"bfabric_save_workflowstep.py"="bfabric_scripts.bfabric_save_workflowstep:main" -"bfabric_slurm_queue_status.py"="bfabric_scripts.bfabric_slurm_queue_status:main" +"bfabric_save_workunit_attribute.py" = "bfabric_scripts.bfabric_save_workunit_attribute:main" +"bfabric_save_workflowstep.py" = "bfabric_scripts.bfabric_save_workflowstep:main" +"bfabric_slurm_queue_status.py" = "bfabric_scripts.bfabric_slurm_queue_status:main" +"bfabric_save_resource_description.py" = "bfabric_scripts.bfabric_save_resource_description:main" + 
+[tool.setuptools.package-data] +"*" = ["py.typed"] + +[tool.uv] +reinstall-package = ["bfabric", "bfabric_scripts"] [tool.black] line-length = 120 @@ -89,9 +94,15 @@ indent-width = 4 target-version = "py39" [tool.ruff.lint] -#select = ["ANN", "BLE", "D103", "E", "F", "PLW", "PTH", "SIM", "UP", "TCH", "N"] -select = ["PLW", "SIM", "UP", "EXE", "E701", "E702", "E703", "E711", "E713", "E714", "FA100", "FA102", "W191"] -ignore = ["ANN101", "ANN102"] +select = ["ANN", "BLE", "D103", "E", "EXE", "F", "N", "PLW", "PTH", "SIM", "TCH", "UP", "W191"] +ignore = ["ANN101", "ANN102", "ANN401"] + +[tool.ruff.lint.per-file-ignores] +"**/bfabric_scripts/**" = ["ALL"] +"**/wrapper_creator/**" = ["ALL"] +"**/examples/**" = ["ALL"] +"**/tests/**" = ["ALL"] +"noxfile.py" = ["ALL"] [tool.licensecheck] using = "PEP631" diff --git a/release.py b/release.py deleted file mode 100644 index 28e666fb..00000000 --- a/release.py +++ /dev/null @@ -1,82 +0,0 @@ -import subprocess -import re -import tomllib -import sys -from pathlib import Path - - -def get_remote_tags() -> list[str]: - # executes: git ls-remote --tags origin - out = subprocess.run(["git", "ls-remote", "--tags", "origin"], check=True, capture_output=True) - return [line.split("\t")[-1].split("/")[-1] for line in out.stdout.decode().split("\n") if line] - - -def get_local_tags() -> list[str]: - # executes: git tag - out = subprocess.run(["git", "tag"], check=True, capture_output=True) - return out.stdout.decode().split("\n") - - -def get_existing_releases(remote: bool) -> list[str]: - tags = get_remote_tags() if remote else get_local_tags() - - # e.g. 1.2.21 - pattern = re.compile(r"^\d+\.\d+\.\d+$") - return [tag for tag in tags if pattern.match(tag)] - - -def get_most_recent_release(remote: bool) -> str: - sorted_releases = sorted(get_existing_releases(remote=remote), key=lambda x: tuple(map(int, x.split(".")))) - return sorted_releases[-1] - - -def get_current_pyproject_toml_version() -> str: - pyproject_toml_path = Path("pyproject.toml") - pyproject_toml = tomllib.loads(pyproject_toml_path.read_text()) - return pyproject_toml["project"]["version"] - - -def check_version() -> str: - released_remote = get_most_recent_release(remote=True) - released_local = get_most_recent_release(remote=False) - current = get_current_pyproject_toml_version() - if released_remote == current: - print(f"Version {current} is already released remotely. Please bump the version in pyproject.toml") - sys.exit(1) - elif released_local == current: - print(f"Version {current} is already released locally. 
Please bump the version in pyproject.toml") - sys.exit(1) - else: - return current - - -def checkout_branch(branch: str) -> None: - subprocess.run(["git", "checkout", branch], check=True) - - -def create_and_push_tag(version: str) -> None: - subprocess.run(["git", "tag", version], check=True) - subprocess.run(["git", "push", "origin", version], check=True) - - -def merge_and_push_current_branch(branch: str) -> None: - subprocess.run(["git", "merge", branch], check=True) - subprocess.run(["git", "push", "origin"], check=True) - - -def publish_docs() -> None: - subprocess.run(["mkdocs", "gh-deploy"], check=True) - - -def main() -> None: - checkout_branch("main") - version = check_version() - create_and_push_tag(version) - checkout_branch("stable") - merge_and_push_current_branch("main") - checkout_branch("main") - publish_docs() - - -if __name__ == "__main__": - main() diff --git a/src/bfabric/__init__.py b/src/bfabric/__init__.py index 8970b7fb..fee7836f 100644 --- a/src/bfabric/__init__.py +++ b/src/bfabric/__init__.py @@ -1,7 +1,7 @@ import importlib.metadata from bfabric.bfabric import Bfabric, BfabricAPIEngineType -from bfabric.config import BfabricAuth, BfabricClientConfig +from bfabric.config.bfabric_auth import BfabricAuth from bfabric.config.bfabric_client_config import BfabricClientConfig __all__ = [ @@ -11,5 +11,4 @@ "BfabricClientConfig", ] - __version__ = importlib.metadata.version("bfabric") diff --git a/src/bfabric/bfabric.py b/src/bfabric/bfabric.py index 72693604..6aa2ac8e 100644 --- a/src/bfabric/bfabric.py +++ b/src/bfabric/bfabric.py @@ -16,14 +16,13 @@ import base64 import importlib.metadata import sys -from contextlib import AbstractContextManager from contextlib import contextmanager from datetime import datetime from enum import Enum from functools import cached_property from pathlib import Path from pprint import pprint -from typing import Literal, Any +from typing import Literal, Any, TYPE_CHECKING from loguru import logger from rich.console import Console @@ -37,6 +36,9 @@ from bfabric.results.result_container import ResultContainer from bfabric.utils.paginator import compute_requested_pages, BFABRIC_QUERY_LIMIT +if TYPE_CHECKING: + from collections.abc import Generator + class BfabricAPIEngineType(Enum): """Choice of engine to use.""" @@ -112,7 +114,7 @@ def auth(self) -> BfabricAuth: return self._auth @contextmanager - def with_auth(self, auth: BfabricAuth) -> AbstractContextManager[Bfabric]: + def with_auth(self, auth: BfabricAuth) -> Generator[None, None, None]: """Context manager that temporarily (within the scope of the context) sets the authentication for the Bfabric object to the provided value. This is useful when authenticating multiple users, to avoid accidental use of the wrong credentials. 
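A minimal usage sketch for the `with_auth` context manager described above (illustration only, not part of the diff; login and password are placeholders):

```python
from bfabric import Bfabric, BfabricAuth

client = Bfabric.from_config()
temporary_auth = BfabricAuth(login="someuser", password="...")  # placeholder credentials

with client.with_auth(temporary_auth):
    # requests inside the block use the temporary credentials
    client.read("workunit", obj={"status": "processing"}, max_results=5)
# afterwards the original authentication is restored
```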
@@ -167,7 +169,7 @@ def read( logger.debug(f"Requested pages: {requested_pages}") # NOTE: Page numbering starts at 1 - response_items = [] + response_items: list[dict[str, Any]] = [] errors = results.errors page_offset = initial_offset for i_iter, i_page in enumerate(requested_pages): @@ -277,7 +279,7 @@ def __repr__(self) -> str: __str__ = __repr__ - def __getstate__(self): + def __getstate__(self) -> dict[str, Any]: return { "config": self._config, "auth": self._auth, @@ -285,7 +287,7 @@ def __getstate__(self): "query_counter": self.query_counter, } - def __setstate__(self, state): + def __setstate__(self, state: dict[str, Any]) -> None: self._config = state["config"] self._auth = state["auth"] self._engine_type = state["engine_type"] @@ -293,14 +295,14 @@ def __setstate__(self, state): def get_system_auth( - login: str = None, - password: str = None, - base_url: str = None, - config_path: str = None, - config_env: str = None, + login: str | None = None, + password: str | None = None, + base_url: str | None = None, + config_path: str | None = None, + config_env: str | None = None, optional_auth: bool = True, verbose: bool = False, -) -> tuple[BfabricClientConfig, BfabricAuth]: +) -> tuple[BfabricClientConfig, BfabricAuth | None]: """ :param login: Login string for overriding config file :param password: Password for overriding config file @@ -312,23 +314,21 @@ def get_system_auth( otherwise an exception will be raised :param verbose: Verbosity (TODO: resolve potential redundancy with logger) """ - - have_config_path = config_path is not None - config_path = Path(config_path or "~/.bfabricpy.yml").expanduser() + resolved_path = Path(config_path or "~/.bfabricpy.yml").expanduser() # Use the provided config data from arguments instead of the file - if not config_path.is_file(): - if have_config_path: + if not resolved_path.is_file(): + if config_path: # NOTE: If user explicitly specifies a path to a wrong config file, this has to be an exception - raise OSError(f"Explicitly specified config file does not exist: {config_path}") + raise OSError(f"Explicitly specified config file does not exist: {resolved_path}") # TODO: Convert to log - print(f"Warning: could not find the config file in the default location: {config_path}") + print(f"Warning: could not find the config file in the default location: {resolved_path}") config = BfabricClientConfig(base_url=base_url) - auth = None if login is None and password is None else BfabricAuth(login=login, password=password) + auth = None if login is None or password is None else BfabricAuth(login=login, password=password) # Load config from file, override some of the fields with the provided ones else: - config, auth = read_config(config_path, config_env=config_env) + config, auth = read_config(resolved_path, config_env=config_env) config = config.copy_with(base_url=base_url) if (login is not None) and (password is not None): auth = BfabricAuth(login=login, password=password) diff --git a/src/bfabric/bfabric2.py b/src/bfabric/bfabric2.py index 869842f7..ac138da5 100755 --- a/src/bfabric/bfabric2.py +++ b/src/bfabric/bfabric2.py @@ -3,4 +3,3 @@ warnings.warn("bfabric.bfabric2 module is deprecated, use bfabric instead", DeprecationWarning) # TODO deprecated - import from bfabric instead -from bfabric.bfabric import Bfabric, BfabricAPIEngineType, get_system_auth diff --git a/src/bfabric/bfabric_config.py b/src/bfabric/bfabric_config.py index 0ccf2933..cf4ca7e9 100644 --- a/src/bfabric/bfabric_config.py +++ b/src/bfabric/bfabric_config.py @@ -12,7 +12,7 @@ 
def read_config( config_path: str | Path, - config_env: str = None, + config_env: str | None = None, ) -> tuple[BfabricClientConfig, BfabricAuth | None]: """ Reads bfabricpy.yml file, parses it, extracting authentication and configuration data diff --git a/src/bfabric/bfabric_legacy.py b/src/bfabric/bfabric_legacy.py deleted file mode 100644 index 673d22cd..00000000 --- a/src/bfabric/bfabric_legacy.py +++ /dev/null @@ -1,243 +0,0 @@ -from __future__ import annotations -import base64 -import json -import os -import sys -from pprint import pprint -from typing import Any - -import yaml -from suds.client import Client -from suds.wsdl import Service - -from bfabric.config import BfabricClientConfig -from bfabric.config import BfabricAuth -from bfabric.bfabric_config import read_config - - -class BfabricLegacy: - """B-Fabric python3 module - Implements read and save object methods for B-Fabric wsdl interface - """ - - def warning(self, msg) -> None: - sys.stderr.write(f"\033[93m{msg}\033[0m\n") - - def __init__( - self, - login: str = None, - password: str = None, - base_url: str = None, - externaljobid=None, - config_path: str = None, - config_env: str = None, - optional_auth: bool = False, - verbose: bool = False, - ) -> None: - """ - :param login: Login string for overriding config file - :param password: Password for overriding config file - :param base_url: Base url of the BFabric server for overriding config file - :param externaljobid: ? - :param config_path: Path to the config file, in case it is different from default - :param config_env: Which config environment to use. Can also specify via environment variable or use - default in the config file (at your own risk) - :param optional_auth: Whether authentification is optional. If yes, missing authentification will be ignored, - otherwise an exception will be raised - :param verbose: Verbosity (TODO: resolve potential redundancy with logger) - """ - self.verbose = verbose - - self.cl = {} - self.verbose = False - self.query_counter = 0 - - # Get default path config file path - config_path = config_path or os.path.normpath(os.path.expanduser("~/.bfabricpy.yml")) - - # TODO: Convert to an exception when this branch becomes main - config_path or os.path.normpath(os.path.expanduser("~/.bfabricrc.py")) - if os.path.isfile(config_path): - self.warning( - "WARNING! The old .bfabricrc.py was found in the home directory. 
Delete and make sure to use the new .bfabricpy.yml" - ) - - # Use the provided config data from arguments instead of the file - if not os.path.isfile(config_path): - self.warning("could not find '.bfabricpy.yml' file in home directory.") - self.config = BfabricClientConfig(base_url=base_url) - self.auth = BfabricAuth(login=login, password=password) - - # Load config from file, override some of the fields with the provided ones - else: - config, auth = read_config(config_path, config_env=config_env, optional_auth=optional_auth) - self.config = config.with_overrides(base_url=base_url) - if (login is not None) and (password is not None): - self.auth = BfabricAuth(login=login, password=password) - elif (login is None) and (password is None): - self.auth = auth - else: - raise OSError("Must provide both username and password, or neither.") - - if not self.config.base_url: - raise ValueError("base server url missing") - if not optional_auth: - if not self.auth or not self.auth.login or not self.auth.password: - raise ValueError("Authentification not initialized but required") - - msg = f"\033[93m--- base_url {self.config.base_url}; login; {self.auth.login} ---\033[0m\n" - sys.stderr.write(msg) - - if self.verbose: - pprint(self.config) - - def read_object(self, endpoint, obj, page=1, plain=False, idonly=False): - """ - A generic method which can connect to any endpoint, e.g., workunit, project, order, - externaljob, etc, and returns the object with the requested id. - obj is a python dictionary which contains all the attributes of the endpoint - for the "query". - """ - return self._perform_request( - endpoint=endpoint, method="read", plain=plain, params=dict(query=obj, idonly=idonly, page=page) - ) - - def readid_object(self, endpoint, obj, page=1, plain=False): - """ - A generic method which can connect to any endpoint, e.g., workunit, project, order, - externaljob, etc, and returns the object with the requested id. - obj is a python dictionary which contains only the id of the endpoint for the "query". - """ - return self._perform_request(endpoint=endpoint, method="readid", plain=plain, params=dict(query=obj, page=page)) - - def save_object(self, endpoint, obj, debug=None): - """ - same as read_object above but uses the save method. - """ - return self._perform_request(endpoint=endpoint, method="save", plain=debug is not None, params={endpoint: obj}) - - def checkandinsert_object(self, endpoint, obj, debug=None): - """ - wsdl method to check iff dependencies are fulfilled - """ - # TODO This method was changed a while ago to use the "save"endpoint, which makes it functionally identical - # to the save_object method. Check if this was intended. - return self._perform_request(endpoint=endpoint, method="save", plain=debug is not None, params={endpoint: obj}) - - def delete_object(self, endpoint, id=None, debug=None): - """ - same as read_object above but uses the delete method. 
- """ - return self._perform_request(endpoint=endpoint, method="delete", plain=debug is not None, params=dict(id=id)) - - def upload_file(self, filename, workunitid): - with open(filename, "rb") as f: - content = f.read() - - resource_base64 = base64.b64encode(content).decode() - - res = self.save_object( - "resource", - { - "base64": resource_base64, - "name": os.path.basename(filename), - "description": "base64 encoded file", - "workunitid": workunitid, - }, - ) - - return res - - def _get_service(self, endpoint: str) -> Service: - """Returns a `suds.client.Service` object for the given endpoint name.""" - if endpoint not in self.cl: - self.cl[endpoint] = Client(f"{self.config.base_url}/{endpoint}?wsdl", cache=None) - return self.cl[endpoint].service - - def _perform_request(self, endpoint: str, method: str, plain: bool, params: dict[str, Any]) -> Any: - """Performs a request to the given endpoint and returns the result.""" - self.query_counter += 1 - request_params = dict(login=self.auth.login, password=self.auth.password, **params) - service = self._get_service(endpoint=endpoint) - response = getattr(service, method)(request_params) - if plain: - return response - elif getattr(response, "entitiesonpage", None) == 0: - return [] - return getattr(response, endpoint) - - @staticmethod - def print_json(queryres=None) -> None: - """ - This method prints the query result as returned by ``read_object`` in JSON format. - - Parameter - --------- - - queryres : the object returned by ``read_object`` method. - """ - if queryres is None: - raise TypeError( - "print_json() missing 1 required positional argument: please provide the output from read_object as parameter to print_json" - ) - - res = json.dumps(queryres, cls=bfabricEncoder, sort_keys=True, indent=2) - print(res) - - @staticmethod - def print_yaml(queryres=None) -> None: - """ - This method prints the query result as returned by ``read_object`` in YAML format. - - Parameter - --------- - - queryres : the object returned by ``read_object`` method. - """ - if queryres is None: - raise TypeError( - "print_yaml() missing 1 required positional argument: please provide the output from read_object as parameter to print_yaml" - ) - - res_json = json.dumps(queryres, cls=bfabricEncoder, sort_keys=True) - res = yaml.dump(res_json, default_flow_style=False, encoding=None, default_style=None) - print(res) - - def get_sampleid(self, resourceid=None): - """ - determines the sample_id of a given resource_id. - it performs a recursive dfs. 
- TODO(cp): check if the method should be implemented using a stack - - :param resourceid: - :return: (int, int) - """ - - assert isinstance(resourceid, int) - - try: - resource = self.read_object("resource", obj={"id": resourceid})[0] - except: - return None - - try: - workunit = self.read_object(endpoint="workunit", obj={"id": resource.workunit._id})[0] - return self.get_sampleid(resourceid=int(workunit.inputresource[0]._id)) - except: - self.warning(f"fetching sampleid of resource.workunitid = {resource.workunit._id} failed.") - return None - - -class bfabricEncoder(json.JSONEncoder): - """ - Implements json encoder for the Bfabric.print_json method - """ - - def default(self, o): - try: - return dict(o) - except TypeError: - pass - else: - return list(o) - return JSONEncoder.default(self, o) diff --git a/src/bfabric/cli_formatting.py b/src/bfabric/cli_formatting.py index f558b86a..77156948 100644 --- a/src/bfabric/cli_formatting.py +++ b/src/bfabric/cli_formatting.py @@ -18,10 +18,15 @@ class HostnameHighlighter(RegexHighlighter): def setup_script_logging(debug: bool = False) -> None: """Sets up the logging for the command line scripts.""" + setup_flag_key = "BFABRICPY_SCRIPT_LOGGING_SETUP" + if os.environ.get(setup_flag_key, "0") == "1": + return logger.remove() + packages = ["bfabric", "bfabric_scripts", "app_runner", "__main__"] if not (debug or os.environ.get("BFABRICPY_DEBUG")): - logger.add(sys.stderr, filter="bfabric", level="INFO", format="{level} {message}") - logger.add(sys.stderr, filter="__main__", level="INFO", format="{level} {message}") + for package in packages: + logger.add(sys.stderr, filter=package, level="INFO", format="{level} {message}") else: - logger.add(sys.stderr, filter="bfabric", level="DEBUG") - logger.add(sys.stderr, filter="__main__", level="DEBUG") + for package in packages: + logger.add(sys.stderr, filter=package, level="DEBUG") + os.environ[setup_flag_key] = "1" diff --git a/src/bfabric/config/bfabric_client_config.py b/src/bfabric/config/bfabric_client_config.py index 083219a8..e7bfe72b 100644 --- a/src/bfabric/config/bfabric_client_config.py +++ b/src/bfabric/config/bfabric_client_config.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Annotated +from typing import Annotated, Any from pydantic import BaseModel, BeforeValidator, Field, TypeAdapter, AnyHttpUrl @@ -25,7 +25,8 @@ class BfabricClientConfig(BaseModel): application_ids: Annotated[dict[str, int], Field(default_factory=dict)] job_notification_emails: Annotated[str, Field(default="")] - def __init__(self, **kwargs) -> None: + def __init__(self, **kwargs: Any) -> None: + # TODO remove this custom constructor (note that this is currently used in some places when "None" is passed) super().__init__(**{key: value for key, value in kwargs.items() if value is not None}) def copy_with( diff --git a/src/bfabric/config/config_file.py b/src/bfabric/config/config_file.py index c8e86a5e..1983ba9a 100644 --- a/src/bfabric/config/config_file.py +++ b/src/bfabric/config/config_file.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from typing import Annotated +from typing import Annotated, Any from loguru import logger from pydantic import BaseModel, Field, model_validator @@ -21,14 +21,14 @@ class EnvironmentConfig(BaseModel): @model_validator(mode="before") @classmethod - def gather_config(cls, values): + def gather_config(cls, values: dict[str, Any]) -> dict[str, Any]: """Gathers all configs into the config field.""" values["config"] = {key: value for key, value in 
values.items() if key not in ["login", "password"]} return values @model_validator(mode="before") @classmethod - def gather_auth(cls, values): + def gather_auth(cls, values: dict[str, Any]) -> dict[str, Any]: if "login" in values: values["auth"] = BfabricAuth.model_validate(values) return values @@ -40,7 +40,7 @@ class ConfigFile(BaseModel): @model_validator(mode="before") @classmethod - def gather_configs(cls, values): + def gather_configs(cls, values: dict[str, Any]) -> dict[str, Any]: """Gathers all configs into the configs field.""" configs = {} for key, value in values.items(): diff --git a/src/bfabric/engine/response_format_suds.py b/src/bfabric/engine/response_format_suds.py index aaf64d54..eda2988b 100644 --- a/src/bfabric/engine/response_format_suds.py +++ b/src/bfabric/engine/response_format_suds.py @@ -1,10 +1,15 @@ from __future__ import annotations -from typing import Any + +from typing import Any, TYPE_CHECKING + from suds.sax.text import Text from suds.sudsobject import asdict +if TYPE_CHECKING: + Value = list["Value"] | dict[str, "Value"] | str | int | float | bool | None | Any + -def convert_suds_type(item: Any) -> int | str: +def convert_suds_type(item: Any) -> int | str | Any: """ Converts the suds type to an equivalent python type. There is, to my knowledge, only a single suds type which is currently ever return, namely 'Text'. Integers and doubles are already cast to their python equivalents and @@ -17,24 +22,25 @@ def convert_suds_type(item: Any) -> int | str: return item -def suds_asdict_recursive(d, convert_types: bool = False) -> dict: +def suds_asdict_recursive(d: Any, convert_types: bool = False) -> dict[str, Value]: """Convert Suds object into serializable format. https://stackoverflow.com/a/15678861 :param d: The input suds object :param convert_types: A boolean to determine if the simple types return should be cast to python types :return: The suds object converted to an OrderedDict """ - out = {} + out: dict[str, Value] = {} for k, v in asdict(d).items(): if hasattr(v, "__keylist__"): out[k] = suds_asdict_recursive(v, convert_types=convert_types) elif isinstance(v, list): - out[k] = [] + items: list[Value] = [] for item in v: if hasattr(item, "__keylist__"): - out[k].append(suds_asdict_recursive(item, convert_types=convert_types)) + items.append(suds_asdict_recursive(item, convert_types=convert_types)) else: - out[k].append(convert_suds_type(item) if convert_types else item) + items.append(convert_suds_type(item) if convert_types else item) + out[k] = items else: out[k] = convert_suds_type(v) if convert_types else v return out diff --git a/src/bfabric/entities/application.py b/src/bfabric/entities/application.py index 30d74a27..2296f3e3 100644 --- a/src/bfabric/entities/application.py +++ b/src/bfabric/entities/application.py @@ -1,11 +1,15 @@ from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity from bfabric.entities.core.has_one import HasOne +if TYPE_CHECKING: + from bfabric import Bfabric + from bfabric.entities.executable import Executable + from bfabric.entities.storage import Storage + class Application(Entity): ENDPOINT = "application" @@ -13,5 +17,5 @@ class Application(Entity): def __init__(self, data_dict: dict[str, Any], client: Bfabric | None) -> None: super().__init__(data_dict=data_dict, client=client) - storage = HasOne("Storage", bfabric_field="storage") - executable = HasOne("Executable", bfabric_field="executable") + 
storage: HasOne[Storage] = HasOne("Storage", bfabric_field="storage") + executable: HasOne[Executable] = HasOne("Executable", bfabric_field="executable") diff --git a/src/bfabric/entities/core/entity.py b/src/bfabric/entities/core/entity.py index 468018be..abd39793 100644 --- a/src/bfabric/entities/core/entity.py +++ b/src/bfabric/entities/core/entity.py @@ -4,10 +4,12 @@ from loguru import logger -from bfabric import Bfabric from bfabric.experimental import MultiQuery +from bfabric.experimental.entity_lookup_cache import EntityLookupCache if TYPE_CHECKING: + from collections.abc import Iterable + from bfabric import Bfabric from typing import Any, Self @@ -25,6 +27,9 @@ def id(self) -> int: @property def web_url(self) -> str: + if self._client is None: + msg = "Cannot generate a web URL without a client's config information." + raise ValueError(msg) return f"{self._client.config.base_url}/{self.ENDPOINT}/show.html?id={self.id}" @property @@ -39,23 +44,39 @@ def _client(self) -> Bfabric | None: @classmethod def find(cls, id: int, client: Bfabric) -> Self | None: - result = client.read(cls.ENDPOINT, obj={"id": int(id)}) - return cls(result[0], client=client) if len(result) == 1 else None + """Finds an entity by its ID, if it does not exist `None` is returned.""" + cache = EntityLookupCache.instance() + if cache and cache.contains(entity_type=cls, entity_id=id): + return cache.get(entity_type=cls, entity_id=id) + else: + result = client.read(cls.ENDPOINT, obj={"id": int(id)}) + entity = cls(result[0], client=client) if len(result) == 1 else None + if cache: + cache.put(entity_type=cls, entity_id=id, entity=entity) + return entity @classmethod def find_all(cls, ids: list[int], client: Bfabric) -> dict[int, Self]: - ids = [int(id) for id in ids] - if len(ids) > 100: - result = MultiQuery(client).read_multi(cls.ENDPOINT, {}, "id", ids) - else: - result = client.read(cls.ENDPOINT, obj={"id": ids}) - results = {x["id"]: cls(x, client=client) for x in result} - if len(results) != len(ids): - logger.warning(f"Only found {len(results)} out of {len(ids)}.") - return results + """Returns a dictionary of entities with the given IDs. 
The order will generally match the input, however, + if some entities are not found they will be omitted and a warning will be logged.""" + cache = EntityLookupCache.instance() + ids_requested = cls.__check_ids_list(ids) + + # retrieve entities from cache and from B-Fabric as needed + results_cached = cache.get_all(entity_type=cls, entity_ids=ids) if cache else {} + results_fresh = cls.__retrieve_entities( + client=client, ids_requested=ids_requested, ids_cached=results_cached.keys() + ) + + if cache: + for entity_id, entity in results_fresh.items(): + cache.put(entity_type=cls, entity_id=entity_id, entity=entity) + + return cls.__ensure_results_order(ids_requested, results_cached, results_fresh) @classmethod def find_by(cls, obj: dict[str, Any], client: Bfabric, max_results: int | None = 100) -> dict[int, Self]: + """Returns a dictionary of entities that match the given query.""" result = client.read(cls.ENDPOINT, obj=obj, max_results=max_results) return {x["id"]: cls(x, client=client) for x in result} @@ -78,3 +99,39 @@ def __repr__(self) -> str: return f"{self.__class__.__name__}({repr(self.__data_dict)}, client={repr(self.__client)})" __str__ = __repr__ + + @classmethod + def __check_ids_list(cls, ids: list[int]) -> list[int]: + """Converts the ids to a list of integers (if they are not already) and raises an error if this fails or + there are duplicates.""" + ids_requested = [int(id) for id in ids] + if len(ids_requested) != len(set(ids_requested)): + duplicates = [item for item in set(ids_requested) if ids_requested.count(item) > 1] + raise ValueError(f"Duplicate IDs are not allowed, duplicates: {duplicates}") + return ids_requested + + @classmethod + def __retrieve_entities( + cls, client: Bfabric, ids_requested: list[int], ids_cached: Iterable[int] + ) -> dict[int, Self]: + """Retrieves entities from B-Fabric that are not already in the cache""" + ids = list(set(ids_requested) - set(ids_cached)) + if ids: + if len(ids) > 100: + result = MultiQuery(client).read_multi(cls.ENDPOINT, {}, "id", ids) + else: + result = client.read(cls.ENDPOINT, obj={"id": ids}) + return {x["id"]: cls(x, client=client) for x in result} + else: + return {} + + @classmethod + def __ensure_results_order( + cls, ids_requested: list[int], results_cached: dict[int, Self], results_fresh: dict[int, Self] + ) -> dict[int, Self]: + """Ensures the results are in the same order as requested and prints a warning if some results are missing.""" + results = {**results_cached, **results_fresh} + results = {entity_id: results[entity_id] for entity_id in ids_requested if entity_id in results} + if len(results) != len(ids_requested): + logger.warning(f"Only found {len(results)} out of {len(ids_requested)}.") + return results diff --git a/src/bfabric/entities/core/has_many.py b/src/bfabric/entities/core/has_many.py index aa98d658..22f815fa 100644 --- a/src/bfabric/entities/core/has_many.py +++ b/src/bfabric/entities/core/has_many.py @@ -1,16 +1,21 @@ from __future__ import annotations -from collections.abc import Iterable - +from typing import Generic, TypeVar, TYPE_CHECKING +from bfabric.entities.core.relationship import Relationship from polars import DataFrame -from bfabric import Bfabric -from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from collections.abc import Iterator + from bfabric import Bfabric -from bfabric.entities.core.relationship import Relationship + # noinspection PyUnresolvedReferences + from bfabric.entities.core.entity import Entity +E = TypeVar("E", bound="Entity") +T = TypeVar("T") 
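As an aside (not part of the diff), a sketch of how the entity lookup helpers shown above are typically called; the IDs and the query are placeholders.

```python
from bfabric import Bfabric
from bfabric.entities import Resource

client = Bfabric.from_config()

one = Resource.find(id=12345, client=client)                  # single entity, or None if missing
many = Resource.find_all(ids=[12345, 12346], client=client)   # dict keyed by ID, input order preserved
some = Resource.find_by({"status": "available"}, client=client, max_results=10)
```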
-class HasMany(Relationship): + +class HasMany(Relationship[E]): def __init__( self, entity: str, @@ -26,40 +31,44 @@ def __init__( self._client_property = client_property self._optional = optional - def __get__(self, obj, objtype=None) -> _HasManyProxy: + def __get__(self, obj: T | None, objtype: type[T] | None = None) -> _HasManyProxy: cache_attr = f"_HasMany__{self._ids_property or self._bfabric_field}_cache" + if obj is None: + raise ValueError("Cannot access HasMany relationship on class") if not hasattr(obj, cache_attr): ids = self._get_ids(obj) client = getattr(obj, self._client_property) setattr(obj, cache_attr, _HasManyProxy(entity_type=self._entity_type, ids=ids, client=client)) return getattr(obj, cache_attr) - def _get_ids(self, obj) -> list[int]: - if (self._bfabric_field is None) == (self._ids_property is None): - raise ValueError("Exactly one of bfabric_field and ids_property must be set") + def _get_ids(self, obj: T) -> list[int]: if self._bfabric_field is not None: + if self._ids_property is not None: + raise ValueError("Exactly one of bfabric_field and ids_property must be set, but both are set") if self._optional and self._bfabric_field not in obj.data_dict: return [] return [x["id"] for x in obj.data_dict[self._bfabric_field]] - else: + elif self._ids_property is not None: if self._optional and not hasattr(obj, self._ids_property): return [] return getattr(obj, self._ids_property) + else: + raise ValueError("Exactly one of bfabric_field and ids_property must be set, but neither is set") -class _HasManyProxy: - def __init__(self, entity_type: type[Entity], ids: list[int], client: Bfabric) -> None: +class _HasManyProxy(Generic[E]): + def __init__(self, entity_type: type[E], ids: list[int], client: Bfabric) -> None: self._entity_type = entity_type self._ids = ids self._client = client - self._items = {} + self._items: dict[int, E] = {} @property def ids(self) -> list[int]: return self._ids @property - def list(self) -> list[Entity]: + def list(self) -> list[E]: self._load_all() return sorted(self._items.values(), key=lambda x: self._items.keys()) @@ -68,11 +77,11 @@ def polars(self) -> DataFrame: self._load_all() return DataFrame([x.data_dict for x in self._items.values()]) - def __getitem__(self, key: int) -> Entity: + def __getitem__(self, key: int) -> E: self._load_all() return self._items[key] - def __iter__(self) -> Iterable[Entity]: + def __iter__(self) -> Iterator[E]: self._load_all() return iter(sorted(self._items.values(), key=lambda x: self._items.keys())) diff --git a/src/bfabric/entities/core/has_one.py b/src/bfabric/entities/core/has_one.py index b2957999..2fdc13ee 100644 --- a/src/bfabric/entities/core/has_one.py +++ b/src/bfabric/entities/core/has_one.py @@ -1,22 +1,30 @@ from __future__ import annotations -from bfabric.entities.core.entity import Entity +from typing import TypeVar, TYPE_CHECKING + from bfabric.entities.core.relationship import Relationship +if TYPE_CHECKING: + # noinspection PyUnresolvedReferences + from bfabric.entities.core.entity import Entity + +E = TypeVar("E", bound="Entity") +T = TypeVar("T") + -class HasOne(Relationship): +class HasOne(Relationship[E]): def __init__(self, entity: str, *, bfabric_field: str, optional: bool = False) -> None: super().__init__(entity) self._bfabric_field = bfabric_field self._optional = optional - def __get__(self, obj, objtype=None) -> Entity | None: + def __get__(self, obj: T | None, objtype: type[T] | None = None) -> E | None: cache_attr = f"_HasOne__{self._bfabric_field}_cache" if not hasattr(obj, 
cache_attr): setattr(obj, cache_attr, self._load_entity(obj=obj)) return getattr(obj, cache_attr) - def _load_entity(self, obj) -> Entity | None: + def _load_entity(self, obj: T) -> E | None: client = obj._client entity_data = obj.data_dict.get(self._bfabric_field) if self._optional and entity_data is None: diff --git a/src/bfabric/entities/core/relationship.py b/src/bfabric/entities/core/relationship.py index ada1e734..f543e63c 100644 --- a/src/bfabric/entities/core/relationship.py +++ b/src/bfabric/entities/core/relationship.py @@ -2,16 +2,21 @@ import importlib from functools import cached_property +from typing import TypeVar, Generic, TYPE_CHECKING -from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric.entities.core.entity import Entity -class Relationship: +E = TypeVar("E", bound="Entity") + + +class Relationship(Generic[E]): def __init__(self, entity: str) -> None: self._entity_type_name = entity @cached_property - def _entity_type(self) -> type[Entity]: + def _entity_type(self) -> type[E]: return importlib.import_module(f"bfabric.entities.{self._entity_type_name.lower()}").__dict__[ self._entity_type_name ] diff --git a/src/bfabric/entities/dataset.py b/src/bfabric/entities/dataset.py index 88a9b46b..4ec1ef10 100644 --- a/src/bfabric/entities/dataset.py +++ b/src/bfabric/entities/dataset.py @@ -1,13 +1,16 @@ from __future__ import annotations +import tempfile from pathlib import Path -from typing import Any +from typing import Any, TYPE_CHECKING from polars import DataFrame -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric import Bfabric + class Dataset(Entity): """Immutable representation of a single dataset in B-Fabric. @@ -31,3 +34,11 @@ def to_polars(self) -> DataFrame: def write_csv(self, path: Path, separator: str = ",") -> None: """Writes the dataset to a csv file at `path`, using the specified column `separator`.""" self.to_polars().write_csv(path, separator=separator) + + def get_csv(self, separator: str = ",") -> str: + """Returns the dataset as a csv string, using the specified column `separator`.""" + with tempfile.NamedTemporaryFile() as tmp_file: + self.write_csv(Path(tmp_file.name), separator=separator) + tmp_file.flush() + tmp_file.seek(0) + return tmp_file.read().decode() diff --git a/src/bfabric/entities/executable.py b/src/bfabric/entities/executable.py index fdbca5b7..7fffc96d 100644 --- a/src/bfabric/entities/executable.py +++ b/src/bfabric/entities/executable.py @@ -1,10 +1,12 @@ from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric import Bfabric + class Executable(Entity): ENDPOINT = "executable" diff --git a/src/bfabric/entities/externaljob.py b/src/bfabric/entities/externaljob.py index 3b9a6641..146d9dae 100644 --- a/src/bfabric/entities/externaljob.py +++ b/src/bfabric/entities/externaljob.py @@ -3,11 +3,13 @@ from functools import cached_property from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +from bfabric.entities.core.has_one import HasOne if TYPE_CHECKING: + from bfabric import Bfabric from bfabric.entities.workunit import Workunit + from bfabric.entities.executable import Executable class ExternalJob(Entity): @@ -16,11 +18,16 @@ class ExternalJob(Entity): def __init__(self, data_dict: dict[str, Any], client: Bfabric | None) -> None: 
super().__init__(data_dict=data_dict, client=client) + executable: HasOne[Executable] = HasOne(entity="Executable", bfabric_field="executable") + @cached_property def workunit(self) -> Workunit | None: from bfabric.entities.workunit import Workunit if self.data_dict["cliententityclassname"] == "Workunit": + if self._client is None: + raise ValueError("Client must be set to resolve Workunit") + return Workunit.find(id=self.data_dict["cliententityid"], client=self._client) else: return None diff --git a/src/bfabric/entities/multiplexid.py b/src/bfabric/entities/multiplexid.py index e05096bb..8801c7f5 100644 --- a/src/bfabric/entities/multiplexid.py +++ b/src/bfabric/entities/multiplexid.py @@ -1,10 +1,12 @@ from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric import Bfabric + class MultiplexId(Entity): ENDPOINT = "multiplexid" diff --git a/src/bfabric/entities/multiplexkit.py b/src/bfabric/entities/multiplexkit.py index ed27fd04..6f78de8f 100644 --- a/src/bfabric/entities/multiplexkit.py +++ b/src/bfabric/entities/multiplexkit.py @@ -1,20 +1,24 @@ from __future__ import annotations import polars as pl from functools import cached_property -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity from bfabric.entities.core.has_many import HasMany +if TYPE_CHECKING: + from bfabric import Bfabric + from bfabric.entities.multiplexid import MultiplexId + + class MultiplexKit(Entity): ENDPOINT = "multiplexkit" def __init__(self, data_dict: dict[str, Any], client: Bfabric | None) -> None: super().__init__(data_dict=data_dict, client=client) - multiplex_ids = HasMany("MultiplexId", bfabric_field="multiplexid") + multiplex_ids: HasMany[MultiplexId] = HasMany("MultiplexId", bfabric_field="multiplexid") @cached_property def ids(self) -> pl.DataFrame: diff --git a/src/bfabric/entities/order.py b/src/bfabric/entities/order.py index 43d01534..81b7b713 100644 --- a/src/bfabric/entities/order.py +++ b/src/bfabric/entities/order.py @@ -1,11 +1,14 @@ from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity from bfabric.entities.core.has_one import HasOne +if TYPE_CHECKING: + from bfabric import Bfabric + from bfabric.entities.project import Project + class Order(Entity): ENDPOINT = "order" @@ -13,4 +16,4 @@ class Order(Entity): def __init__(self, data_dict: dict[str, Any], client: Bfabric | None) -> None: super().__init__(data_dict=data_dict, client=client) - project = HasOne("Project", bfabric_field="project") + project: HasOne[Project] = HasOne("Project", bfabric_field="project") diff --git a/src/bfabric/entities/parameter.py b/src/bfabric/entities/parameter.py index 3dc7c9ec..beb93720 100644 --- a/src/bfabric/entities/parameter.py +++ b/src/bfabric/entities/parameter.py @@ -1,10 +1,12 @@ from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric import Bfabric + class Parameter(Entity): ENDPOINT = "parameter" diff --git a/src/bfabric/entities/project.py b/src/bfabric/entities/project.py index 15f9036d..a85e3e02 100644 --- a/src/bfabric/entities/project.py +++ b/src/bfabric/entities/project.py @@ -1,10 +1,12 @@ 
from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric import Bfabric + class Project(Entity): ENDPOINT = "project" diff --git a/src/bfabric/entities/resource.py b/src/bfabric/entities/resource.py index 27b938d7..7e0a40a3 100644 --- a/src/bfabric/entities/resource.py +++ b/src/bfabric/entities/resource.py @@ -1,11 +1,15 @@ from __future__ import annotations -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity from bfabric.entities.core.has_one import HasOne +if TYPE_CHECKING: + from bfabric import Bfabric + from bfabric.entities.storage import Storage + from bfabric.entities.workunit import Workunit + class Resource(Entity): ENDPOINT = "resource" @@ -13,5 +17,5 @@ class Resource(Entity): def __init__(self, data_dict: dict[str, Any], client: Bfabric | None = None) -> None: super().__init__(data_dict=data_dict, client=client) - storage = HasOne("Storage", bfabric_field="storage") - workunit = HasOne("Workunit", bfabric_field="workunit") + storage: HasOne[Storage] = HasOne("Storage", bfabric_field="storage") + workunit: HasOne[Workunit] = HasOne("Workunit", bfabric_field="workunit") diff --git a/src/bfabric/entities/storage.py b/src/bfabric/entities/storage.py index c13cecc2..4d73c5a8 100644 --- a/src/bfabric/entities/storage.py +++ b/src/bfabric/entities/storage.py @@ -2,11 +2,13 @@ from functools import cached_property from pathlib import Path -from typing import Any +from typing import Any, TYPE_CHECKING -from bfabric import Bfabric from bfabric.entities.core.entity import Entity +if TYPE_CHECKING: + from bfabric import Bfabric + class Storage(Entity): ENDPOINT = "storage" diff --git a/src/bfabric/entities/workunit.py b/src/bfabric/entities/workunit.py index e18e6a7c..2fbfdf37 100644 --- a/src/bfabric/entities/workunit.py +++ b/src/bfabric/entities/workunit.py @@ -5,14 +5,19 @@ import dateutil.parser -from bfabric import Bfabric from bfabric.entities.core.entity import Entity from bfabric.entities.core.has_many import HasMany from bfabric.entities.core.has_one import HasOne if TYPE_CHECKING: - from bfabric.entities.project import Project + from bfabric import Bfabric + from bfabric.entities.application import Application + from bfabric.entities.dataset import Dataset + from bfabric.entities.externaljob import ExternalJob from bfabric.entities.order import Order + from bfabric.entities.parameter import Parameter + from bfabric.entities.project import Project + from bfabric.entities.resource import Resource class Workunit(Entity): @@ -25,12 +30,12 @@ class Workunit(Entity): def __init__(self, data_dict: dict[str, Any], client: Bfabric | None = None) -> None: super().__init__(data_dict=data_dict, client=client) - application = HasOne(entity="Application", bfabric_field="application") - parameters = HasMany(entity="Parameter", bfabric_field="parameter") - resources = HasMany(entity="Resource", bfabric_field="resource") - input_resources = HasMany(entity="Resource", bfabric_field="inputresource", optional=True) - input_dataset = HasOne(entity="Dataset", bfabric_field="inputdataset", optional=True) - external_jobs = HasMany(entity="ExternalJob", bfabric_field="externaljob", optional=True) + application: HasOne[Application] = HasOne(entity="Application", bfabric_field="application") + parameters: HasMany[Parameter] = HasMany(entity="Parameter", 
bfabric_field="parameter") + resources: HasMany[Resource] = HasMany(entity="Resource", bfabric_field="resource") + input_resources: HasMany[Resource] = HasMany(entity="Resource", bfabric_field="inputresource", optional=True) + input_dataset: HasOne[Dataset] = HasOne(entity="Dataset", bfabric_field="inputdataset", optional=True) + external_jobs: HasMany[ExternalJob] = HasMany(entity="ExternalJob", bfabric_field="externaljob", optional=True) @cached_property def parameter_values(self) -> dict[str, Any]: @@ -41,16 +46,29 @@ def container(self) -> Project | Order: from bfabric.entities.project import Project from bfabric.entities.order import Order + if self._client is None: + raise ValueError("Cannot determine the container without a client.") + + result: Project | Order | None if self.data_dict["container"]["classname"] == Project.ENDPOINT: - return Project.find(id=self.data_dict["container"]["id"], client=self._client) + result = Project.find(id=self.data_dict["container"]["id"], client=self._client) elif self.data_dict["container"]["classname"] == Order.ENDPOINT: - return Order.find(id=self.data_dict["container"]["id"], client=self._client) + result = Order.find(id=self.data_dict["container"]["id"], client=self._client) else: raise ValueError(f"Unknown container classname: {self.data_dict['container']['classname']}") + if result is None: + raise ValueError(f"Could not find container with ID {self.data_dict['container']['id']}") + + return result + @cached_property def store_output_folder(self) -> Path: """Relative path in the storage for the workunit output.""" + if self.application is None: + raise ValueError("Cannot determine the storage path without an application.") + if self.application.storage is None: + raise ValueError("Cannot determine the storage path without an application storage configuration.") date = dateutil.parser.parse(self.data_dict["created"]) return Path( f"{self.application.storage['projectfolderprefix']}{self.container.id}", diff --git a/src/bfabric/errors.py b/src/bfabric/errors.py index 28545748..a8d2a634 100644 --- a/src/bfabric/errors.py +++ b/src/bfabric/errors.py @@ -3,7 +3,7 @@ from typing import Any -class BfabricRequestError(Exception): +class BfabricRequestError(RuntimeError): """An error that is returned by the server in response to a full request.""" def __init__(self, message: str) -> None: diff --git a/src/bfabric/experimental/app_interface/app_runner/__main__.py b/src/bfabric/experimental/app_interface/app_runner/__main__.py deleted file mode 100644 index c70d6b08..00000000 --- a/src/bfabric/experimental/app_interface/app_runner/__main__.py +++ /dev/null @@ -1,39 +0,0 @@ -from __future__ import annotations - -import argparse -from pathlib import Path -from typing import Union - -import yaml -from pydantic import TypeAdapter - -from bfabric.bfabric import Bfabric -from bfabric.cli_formatting import setup_script_logging -from bfabric.experimental.app_interface.app_runner._spec import AppSpec -from bfabric.experimental.app_interface.app_runner.runner import run_app - - -def main() -> None: - setup_script_logging() - client = Bfabric.from_config() - parser = argparse.ArgumentParser() - parser.add_argument("action", default="run", choices=["run"]) - parser.add_argument("--app-spec", type=Path, required=True) - parser.add_argument("--workunit-ref", type=TypeAdapter(Union[int, Path]).validate_strings, required=True) - parser.add_argument("--work-dir", type=Path, required=True) - parser.add_argument("--ssh-user", type=str, required=False) - 
parser.add_argument("--read-only", action="store_true") - args = parser.parse_args() - app_spec = AppSpec.model_validate(yaml.safe_load(args.app_spec.read_text())) - run_app( - app_spec=app_spec, - workunit_ref=args.workunit_ref, - work_dir=args.work_dir, - client=client, - ssh_user=args.ssh_user, - read_only=args.read_only, - ) - - -if __name__ == "__main__": - main() diff --git a/src/bfabric/experimental/app_interface/cli/__main__.py b/src/bfabric/experimental/app_interface/cli/__main__.py deleted file mode 100644 index 435743e4..00000000 --- a/src/bfabric/experimental/app_interface/cli/__main__.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import annotations - -import cyclopts - -from bfabric.experimental.app_interface.cli.app import app_app -from bfabric.experimental.app_interface.cli.chunk import app_chunk -from bfabric.experimental.app_interface.cli.inputs import app_inputs -from bfabric.experimental.app_interface.cli.outputs import app_outputs -from bfabric.experimental.app_interface.cli.validate import app_validate - -app = cyclopts.App() -app.command(app_inputs) -app.command(app_outputs) -app.command(app_app) -app.command(app_chunk) -app.command(app_validate) - -if __name__ == "__main__": - app() diff --git a/src/bfabric/experimental/app_interface/cli/inputs.py b/src/bfabric/experimental/app_interface/cli/inputs.py deleted file mode 100644 index 83c37851..00000000 --- a/src/bfabric/experimental/app_interface/cli/inputs.py +++ /dev/null @@ -1,68 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - -import cyclopts - -from bfabric import Bfabric -from bfabric.cli_formatting import setup_script_logging -from bfabric.experimental.app_interface.input_preparation import prepare_folder -from bfabric.experimental.app_interface.input_preparation.prepare import print_input_files_list - -app_inputs = cyclopts.App("inputs", help="Prepare input files for an app.") - - -@app_inputs.command() -def prepare( - inputs_yaml: Path, - target_folder: Path | None = None, - *, - ssh_user: str | None = None, -) -> None: - """Prepare the input files by downloading them (if necessary). - - :param inputs_yaml: Path to the inputs.yml file. - :param target_folder: Path to the target folder where the input files should be downloaded. - :param ssh_user: SSH user to use for downloading the input files, instead of the current user. - """ - setup_script_logging() - client = Bfabric.from_config() - prepare_folder( - inputs_yaml=inputs_yaml, - target_folder=target_folder, - ssh_user=ssh_user, - client=client, - action="prepare", - ) - - -@app_inputs.command() -def clean( - inputs_yaml: Path, - target_folder: Path | None = None, -) -> None: - """Removes all local copies of input files. - - :param inputs_yaml: Path to the inputs.yml file. - :param target_folder: Path to the target folder where the input files should be removed. 
- """ - setup_script_logging() - client = Bfabric.from_config() - # TODO clean shouldn't even need all these arguments, this could be refactored later - prepare_folder( - inputs_yaml=inputs_yaml, - target_folder=target_folder, - ssh_user=None, - action="clean", - client=client, - ) - - -@app_inputs.command() -def list( - inputs_yaml: Path, - target_folder: Path | None = None, -) -> None: - """Lists the input files for an app.""" - setup_script_logging() - print_input_files_list(inputs_yaml=inputs_yaml, target_folder=target_folder) diff --git a/src/bfabric/experimental/app_interface/input_preparation/__main__.py b/src/bfabric/experimental/app_interface/input_preparation/__main__.py deleted file mode 100644 index ec8879b6..00000000 --- a/src/bfabric/experimental/app_interface/input_preparation/__main__.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import annotations - -import argparse -from pathlib import Path - -from bfabric.bfabric import Bfabric -from bfabric.cli_formatting import setup_script_logging -from bfabric.experimental.app_interface.input_preparation.prepare import prepare_folder - - -def main() -> None: - setup_script_logging() - client = Bfabric.from_config() - parser = argparse.ArgumentParser() - parser.add_argument("action", default="prepare", choices=["prepare", "clean"]) - parser.add_argument("--inputs-yaml", type=Path, required=True) - parser.add_argument("--target-folder", type=Path, required=False) - parser.add_argument("--ssh-user", type=str, required=False) - args = parser.parse_args() - prepare_folder( - inputs_yaml=args.inputs_yaml, - target_folder=args.target_folder, - ssh_user=args.ssh_user, - client=client, - action=args.action, - ) - - -if __name__ == "__main__": - main() diff --git a/src/bfabric/experimental/app_interface/input_preparation/prepare.py b/src/bfabric/experimental/app_interface/input_preparation/prepare.py deleted file mode 100644 index 8876e9b0..00000000 --- a/src/bfabric/experimental/app_interface/input_preparation/prepare.py +++ /dev/null @@ -1,143 +0,0 @@ -from __future__ import annotations - -import tempfile -from pathlib import Path - -from loguru import logger -from rich.console import Console -from rich.table import Table, Column - -from bfabric.bfabric import Bfabric -from bfabric.entities import Resource, Dataset -from bfabric.experimental.app_interface.input_preparation._spec import ( - ResourceSpec, - DatasetSpec, - InputSpecType, - InputsSpec, -) -from bfabric.experimental.app_interface.util.checksums import md5sum -from bfabric.experimental.app_interface.util.scp import scp - - -class PrepareInputs: - def __init__(self, client: Bfabric, working_dir: Path, ssh_user: str | None) -> None: - self._client = client - self._working_dir = working_dir - self._ssh_user = ssh_user - - def prepare_all(self, specs: list[InputSpecType]) -> None: - for spec in specs: - logger.debug(f"Preparing {spec}") - if isinstance(spec, ResourceSpec): - self.prepare_resource(spec) - elif isinstance(spec, DatasetSpec): - self.prepare_dataset(spec) - else: - raise ValueError(f"Unknown spec type: {type(spec)}") - - def clean_all(self, specs: list[InputSpecType]) -> None: - for spec in specs: - logger.debug(f"Cleaning {spec}") - if isinstance(spec, ResourceSpec): - self.clean_resource(spec) - elif isinstance(spec, DatasetSpec): - self.clean_dataset(spec) - else: - raise ValueError(f"Unknown spec type: {type(spec)}") - - def prepare_resource(self, spec: ResourceSpec) -> None: - resource = Resource.find(id=spec.id, client=self._client) - - # determine path to copy 
from - scp_uri = f"{resource.storage.scp_prefix}{resource['relativepath']}" - - # determine path to copy to - result_name = spec.filename if spec.filename else resource["name"] - result_path = self._working_dir / result_name - - # copy if necessary - if result_path.exists() and md5sum(result_path) == resource["filechecksum"]: - logger.debug(f"Skipping {resource['name']} as it already exists and has the correct checksum") - else: - scp(scp_uri, str(result_path), user=self._ssh_user) - - # verify checksum - if spec.check_checksum: - actual_checksum = md5sum(result_path) - logger.debug(f"Checksum: expected {resource['filechecksum']}, got {actual_checksum}") - if actual_checksum != resource["filechecksum"]: - raise ValueError(f"Checksum mismatch: expected {resource['filechecksum']}, got {actual_checksum}") - - def prepare_dataset(self, spec: DatasetSpec) -> None: - dataset = Dataset.find(id=spec.id, client=self._client) - target_path = self._working_dir / spec.filename - target_path.parent.mkdir(exist_ok=True, parents=True) - with tempfile.NamedTemporaryFile() as tmp_file: - dataset.write_csv(Path(tmp_file.name), separator=spec.separator) - tmp_file.flush() - tmp_file.seek(0) - if target_path.exists() and target_path.read_text() == tmp_file.read().decode(): - logger.debug(f"Skipping {spec.filename} as it already exists and has the correct content") - else: - tmp_file.seek(0) - target_path.write_text(tmp_file.read().decode()) - - def clean_resource(self, spec: ResourceSpec) -> None: - name = spec.filename if spec.filename else Resource.find(id=spec.id, client=self._client)["name"] - path = self._working_dir / name - if path.exists(): - logger.info(f"Removing {path}") - path.unlink() - else: - logger.debug(f"Resource {path} does not exist") - - def clean_dataset(self, spec: DatasetSpec) -> None: - path = self._working_dir / spec.filename - if path.exists(): - logger.info(f"Removing {path}") - path.unlink() - else: - logger.debug(f"Dataset {path} does not exist") - - -def prepare_folder( - inputs_yaml: Path, target_folder: Path | None, client: Bfabric, ssh_user: str | None, action: str = "prepare" -) -> None: - # set defaults - inputs_yaml = inputs_yaml.absolute() - if target_folder is None: - target_folder = inputs_yaml.parent - - # parse the specs - specs_list = InputsSpec.read_yaml(inputs_yaml) - - # prepare the folder - prepare = PrepareInputs(client=client, working_dir=target_folder, ssh_user=ssh_user) - if action == "prepare": - prepare.prepare_all(specs=specs_list) - elif action == "clean": - prepare.clean_all(specs=specs_list) - else: - raise ValueError(f"Unknown action: {action}") - - -def print_input_files_list( - inputs_yaml: Path, - target_folder: Path, -) -> None: - """Prints a list of inputs and whether they exist locally.""" - specs_list = InputsSpec.read_yaml(inputs_yaml) - table = Table( - Column("File"), - Column("Input Type"), - Column("Exists Locally"), - ) - for spec in specs_list: - path = target_folder / spec.filename if target_folder else Path(spec.filename) - table.add_row( - str(path), - "Resource" if isinstance(spec, ResourceSpec) else "Dataset", - "Yes" if path.exists() else "No", - ) - console = Console() - console.print(table) diff --git a/src/bfabric/experimental/app_interface/output_registration/__main__.py b/src/bfabric/experimental/app_interface/output_registration/__main__.py deleted file mode 100644 index ee0938d0..00000000 --- a/src/bfabric/experimental/app_interface/output_registration/__main__.py +++ /dev/null @@ -1,23 +0,0 @@ -import argparse -from pathlib 
import Path - -from bfabric import Bfabric -from bfabric.cli_formatting import setup_script_logging -from bfabric.experimental.app_interface.output_registration import register_outputs - - -def main() -> None: - setup_script_logging() - client = Bfabric.from_config() - parser = argparse.ArgumentParser() - parser.add_argument("action", default="register", choices=["register"]) - parser.add_argument("--outputs-yaml", type=Path, required=True) - parser.add_argument("--workunit-id", type=int, required=True) - parser.add_argument("--ssh-user", type=str, required=False) - args = parser.parse_args() - register_outputs( - outputs_yaml=args.outputs_yaml, - workunit_id=args.workunit_id, - client=client, - ssh_user=args.ssh_user, - ) diff --git a/src/bfabric/experimental/entity_lookup_cache.py b/src/bfabric/experimental/entity_lookup_cache.py new file mode 100644 index 00000000..24c13c17 --- /dev/null +++ b/src/bfabric/experimental/entity_lookup_cache.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +from collections import defaultdict, OrderedDict +from contextlib import contextmanager +from typing import TypeVar, Generic, TYPE_CHECKING + +from loguru import logger + +if TYPE_CHECKING: + from collections.abc import Generator + from collections.abc import Hashable + from bfabric.entities.core.entity import Entity # type: ignore + +T = TypeVar("T") +E = TypeVar("E", bound="Entity") + + +class Cache(Generic[T]): + """A FIFO cache with a maximum size, implemented by an OrderedDict.""" + + def __init__(self, max_size: int) -> None: + self._entries: OrderedDict[Hashable, T] = OrderedDict() + self._max_size = max_size + + def get(self, key: Hashable) -> T | None: + """Returns the value with the given key, if it exists, and marks it as used. + + If the key does not exist, returns None. + """ + if key in self._entries: + self._entries.move_to_end(key) + return self._entries[key] + + def put(self, key: Hashable, value: T) -> None: + """Puts a key-value pair into the cache, marking it as used.""" + if self._max_size != 0 and len(self._entries) >= self._max_size: + self._entries.popitem(last=False) + self._entries[key] = value + + def __contains__(self, key: Hashable) -> bool: + """Returns whether the cache contains a key.""" + return key in self._entries + + +class EntityLookupCache: + """Implements the logic for caching entity lookup. + + :param max_size: The maximum size of the cache. If 0, the cache has no size limit. + """ + + __class_instance = None + + def __init__(self, max_size: int = 0) -> None: + self._caches: dict[type[Entity], Cache[Entity | None]] = defaultdict(lambda: Cache(max_size=max_size)) + + def contains(self, entity_type: type[Entity], entity_id: int) -> bool: + """Returns whether the cache contains an entity with the given type and ID.""" + return entity_id in self._caches[entity_type] + + def get(self, entity_type: type[E], entity_id: int) -> E | None: + """Returns the entity with the given type and ID, if it exists in the cache.""" + if self._caches[entity_type].get(entity_id): + logger.debug(f"Cache hit for entity {entity_type} with ID {entity_id}") + return self._caches[entity_type].get(entity_id) + else: + logger.debug(f"Cache miss for entity {entity_type} with ID {entity_id}") + return None + + def get_all(self, entity_type: type[Entity], entity_ids: list[int]) -> dict[int, Entity]: + """Returns a dictionary of entities with the given type and IDs, + containing only the entities that exist in the cache. 
+        """
+        results = {entity_id: self.get(entity_type, entity_id) for entity_id in entity_ids}
+        return {entity_id: result for entity_id, result in results.items() if result is not None}
+
+    def put(self, entity_type: type[Entity], entity_id: int, entity: Entity | None) -> None:
+        """Puts an entity with the given type and ID into the cache."""
+        logger.debug(f"Caching entity {entity_type} with ID {entity_id}")
+        self._caches[entity_type].put(entity_id, entity)
+
+    @classmethod
+    @contextmanager
+    def enable(cls, max_size: int = 0) -> Generator[None, None, None]:
+        """Context manager that enables the EntityLookupCache singleton instance, i.e. every entity lookup by ID
+        within this context will be cached. The cache is cleared after the context exits.
+        """
+        existing_cache = cls.__class_instance is not None
+        if not existing_cache:
+            cls.__class_instance = cls(max_size=max_size)
+        # TODO what to do if existing_cache and max_size mismatch?
+        # TODO another relevant use case could be selectively caching only some entities, whereas others should be
+        #      reloaded
+        # TODO finally, there is the question about persistent caches (e.g. storages do not change that often)
+        try:
+            yield
+        finally:
+            if not existing_cache:
+                cls.__class_instance = None
+
+    @classmethod
+    def instance(cls) -> EntityLookupCache | None:
+        """Returns the singleton instance of the EntityLookupCache."""
+        return cls.__class_instance
diff --git a/src/bfabric/experimental/app_interface/workunit/definition.py b/src/bfabric/experimental/workunit_definition.py
similarity index 69%
rename from src/bfabric/experimental/app_interface/workunit/definition.py
rename to src/bfabric/experimental/workunit_definition.py
index 440385e4..55719f75 100644
--- a/src/bfabric/experimental/app_interface/workunit/definition.py
+++ b/src/bfabric/experimental/workunit_definition.py
@@ -1,14 +1,16 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Literal
+from typing import Literal, TYPE_CHECKING
 
 import yaml
 from pydantic import BaseModel, ConfigDict, model_validator
 
-from bfabric import Bfabric
 from bfabric.entities import Workunit
 
+if TYPE_CHECKING:
+    from bfabric import Bfabric
+
 
 class WorkunitExecutionDefinition(BaseModel):
     """Defines the execution details of a workunit."""
@@ -16,6 +18,7 @@ class WorkunitExecutionDefinition(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
     raw_parameters: dict[str, str | None]
+    # TODO drop the executable
     executable: Path
     dataset: int | None = None
     resources: list[int] = []
@@ -32,6 +35,10 @@ def mutually_exclusive_dataset_resources(self) -> WorkunitExecutionDefinition:
     @classmethod
     def from_workunit(cls, workunit: Workunit) -> WorkunitExecutionDefinition:
         """Loads the workunit execution definition from the provided B-Fabric workunit."""
+        if workunit.application is None:
+            raise ValueError("Workunit does not have an application")
+        if workunit.application.executable is None:
+            raise ValueError("Workunit application does not have an executable")
         data = {
             "raw_parameters": workunit.parameter_values,
             "executable": workunit.application.executable["program"],
@@ -71,15 +78,29 @@ class WorkunitDefinition(BaseModel):
     registration: WorkunitRegistrationDefinition | None
 
     @classmethod
-    def from_ref(cls, workunit: Path | int, client: Bfabric) -> WorkunitDefinition:
-        """Loads the workunit definition from the provided reference,
-        which can be a path to a YAML file, or a workunit ID.
+ def from_ref(cls, workunit: Path | int, client: Bfabric, cache_file: Path | None = None) -> WorkunitDefinition: + """Loads the workunit definition from the provided reference, which can be a path to a YAML file, + or a workunit ID. + + If the cache file is provided and exists, it will be loaded directly instead of resolving the reference. + Otherwise, the result will be cached to the provided file. + :param workunit: The workunit reference, which can be a path to a YAML file, or a workunit ID. + :param client: The B-Fabric client to use for resolving the workunit. + :param cache_file: The path to the cache file, if any. """ + if cache_file is not None and cache_file.exists(): + return cls.from_yaml(cache_file) if isinstance(workunit, Path): - return cls.from_yaml(workunit) + result = cls.from_yaml(workunit) else: - workunit = Workunit.find(id=workunit, client=client) - return cls.from_workunit(workunit) + workunit_instance = Workunit.find(id=workunit, client=client) + if workunit_instance is None: + raise ValueError(f"Workunit with ID {workunit} does not exist") + result = cls.from_workunit(workunit=workunit_instance) + if cache_file is not None: + cache_file.parent.mkdir(exist_ok=True, parents=True) + result.to_yaml(cache_file) + return result @classmethod def from_workunit(cls, workunit: Workunit) -> WorkunitDefinition: diff --git a/src/bfabric/py.typed b/src/bfabric/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/bfabric/results/response_format_dict.py b/src/bfabric/results/response_format_dict.py index fb7e0cc5..18bc8d3f 100644 --- a/src/bfabric/results/response_format_dict.py +++ b/src/bfabric/results/response_format_dict.py @@ -1,6 +1,7 @@ from __future__ import annotations from copy import deepcopy +from typing import Any, overload def sort_dict(d: dict) -> dict: @@ -33,14 +34,22 @@ def _recursive_drop_empty(response_elem: list | dict) -> None: del response_elem[k] -def drop_empty_elements(response: list | dict, inplace: bool = True) -> list | dict | None: +@overload +def drop_empty_elements(response: list[dict[str, Any]], inplace: bool) -> list[dict[str, Any]]: ... + + +@overload +def drop_empty_elements(response: dict[str, Any], inplace: bool) -> dict[str, Any]: ... + + +def drop_empty_elements(response: list | dict, inplace: bool = True) -> list | dict: """ Iterates over all nested lists, dictionaries and basic values. Whenever a dictionary value is encountered, that is either an empty list or None, the key-value pair gets deleted from the dictionary :param response: A parsed query response, consisting of nested lists, dicts and basic types (int, str) :param inplace: If true, will return nothing and edit the argument. 
Otherwise, will preserve the argument and return an edited copy - :return: Nothing, or an edited response, depending on `inplace` + :return: An edited response, depending on `inplace` """ response_filtered = deepcopy(response) if not inplace else response _recursive_drop_empty(response_filtered) diff --git a/src/bfabric/results/result_container.py b/src/bfabric/results/result_container.py index 9ac30185..b113a318 100644 --- a/src/bfabric/results/result_container.py +++ b/src/bfabric/results/result_container.py @@ -1,20 +1,24 @@ from __future__ import annotations import logging -from typing import Any, TYPE_CHECKING -from collections.abc import Iterable +from typing import Any, TYPE_CHECKING, overload import bfabric.results.response_format_dict as formatter if TYPE_CHECKING: + from collections.abc import Iterator import polars + from bfabric.errors import BfabricRequestError class ResultContainer: """Container structure for query results.""" def __init__( - self, results: list[dict[str, Any]], total_pages_api: int | None = None, errors: list | None = None + self, + results: list[dict[str, Any]], + total_pages_api: int | None = None, + errors: list[BfabricRequestError] | None = None, ) -> None: """ :param results: List of BFabric query results @@ -28,10 +32,16 @@ def __init__( self._total_pages_api = total_pages_api self._errors = errors or [] - def __getitem__(self, idx: int) -> dict[str, Any]: + @overload + def __getitem__(self, idx: int) -> dict[str, Any]: ... + + @overload + def __getitem__(self, idx: slice) -> list[dict[str, Any]]: ... + + def __getitem__(self, idx: int | slice) -> dict[str, Any] | list[dict[str, Any]]: return self.results[idx] - def __iter__(self) -> Iterable[dict[str, Any]]: + def __iter__(self) -> Iterator[dict[str, Any]]: return iter(self.results) def __repr__(self) -> str: @@ -61,8 +71,8 @@ def is_success(self) -> bool: return len(self._errors) == 0 @property - def errors(self) -> list: - """List of errors that occurred during the query. An empty list means the query was successful.""" + def errors(self) -> list[BfabricRequestError]: + """List of errors that occurred during the query. An empty list indicates success.""" return self._errors def extend(self, other: ResultContainer, reset_total_pages_api: bool = False) -> None: diff --git a/src/bfabric/utils/math_helper.py b/src/bfabric/utils/math_helper.py deleted file mode 100644 index 7e20278f..00000000 --- a/src/bfabric/utils/math_helper.py +++ /dev/null @@ -1,9 +0,0 @@ -def div_int_ceil(n: int, d: int) -> int: - """ - :param n: Numerator - :param d: Denominator - :return: Performs integer ceiling division - Theoretically equivalent to math.ceil(n/d), but not subject to floating-point errors. 
- """ - q, r = divmod(n, d) - return q + bool(r) diff --git a/src/bfabric/utils/paginator.py b/src/bfabric/utils/paginator.py index f20312b3..d4004982 100644 --- a/src/bfabric/utils/paginator.py +++ b/src/bfabric/utils/paginator.py @@ -1,12 +1,16 @@ from __future__ import annotations import math +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Generator # Single page query limit for BFabric API (as of time of writing, adapt if it changes) BFABRIC_QUERY_LIMIT = 100 -def page_iter(objs: list, page_size: int = BFABRIC_QUERY_LIMIT) -> list: +def page_iter(objs: list, page_size: int = BFABRIC_QUERY_LIMIT) -> Generator[list, None, None]: """ :param objs: A list of objects to provide to bfabric as part of a query :param page_size: Number of objects per page diff --git a/src/bfabric/wrapper_creator/bfabric_submitter.py b/src/bfabric/wrapper_creator/bfabric_submitter.py index 8417f38a..7ef35c1c 100644 --- a/src/bfabric/wrapper_creator/bfabric_submitter.py +++ b/src/bfabric/wrapper_creator/bfabric_submitter.py @@ -1,10 +1,14 @@ +from __future__ import annotations + import base64 +from pathlib import Path import yaml +from loguru import logger -import bfabric.wrapper_creator.gridengine as gridengine -import bfabric.wrapper_creator.slurm as slurm -from bfabric.wrapper_creator.bfabric_external_job import BfabricExternalJob +from bfabric.bfabric import Bfabric +from bfabric.entities import ExternalJob, Executable +from bfabric.wrapper_creator.slurm import SLURM class BfabricSubmitter: @@ -12,12 +16,6 @@ class BfabricSubmitter: the class is used by the submitter which is executed by the bfabric system. """ - (G, B) = (None, None) - - workunitid = None - workunit = None - parameters = None - execfilelist = [] slurm_dict = { "MaxQuant_textfiles_sge": {"partition": "prx", "nodelist": "fgcz-r-033", "memory": "1G"}, "fragpipe": {"partition": "prx", "nodelist": "fgcz-r-033", "memory": "256G"}, @@ -30,91 +28,53 @@ class BfabricSubmitter: def __init__( self, - login=None, - password=None, - externaljobid=None, - user="*", - node="PRX@fgcz-r-018", - partition="prx", - nodelist="fgcz-r-028", - memory="10G", - SCHEDULEROOT="/export/bfabric/bfabric/", - scheduler="GridEngine", - ): - """ - :rtype : object - """ - self.B = BfabricExternalJob(login=login, password=password, externaljobid=externaljobid) + client: Bfabric, + externaljobid: int, + user: str = "*", + partition: str = "prx", + nodelist: str = "fgcz-r-028", + memory: str = "10G", + scheduleroot: str = "/export/bfabric/bfabric/", + scheduler: str = "GridEngine", + script_dir: Path = Path("/home/bfabric/prx"), + ) -> None: + self._client = client + self._executable_file_list = [] + self.partition = partition self.nodelist = nodelist self.memory = memory - self.SCHEDULEROOT = SCHEDULEROOT - self.user = user + self.scheduleroot = scheduleroot self.scheduler = scheduler - - print(self.B.auth.login) - print(self.B.externaljobid) - - self.workunitid = self.B.get_workunitid_of_externaljob() - - try: - self.workunit = self.B.read_object(endpoint="workunit", obj={"id": self.workunitid})[0] - except: - print("ERROR: could not fetch workunit while calling constructor in BfabricSubmitter.") - raise - - try: - self.parameters = [ - self.B.read_object(endpoint="parameter", obj={"id": x._id})[0] for x in self.workunit.parameter - ] - except: - self.parameters = list() - print("Warning: could not fetch parameter.") - - partition = [x for x in self.parameters if x.key == "partition"] - nodelist = [x for x in self.parameters if x.key == 
"nodelist"] - memory = [x for x in self.parameters if x.key == "memory"] - application_name = self.B.get_application_name() - - if len(partition) > 0 and len(nodelist) > 0 and len(memory) > 0: - self.partition = partition[0].value - self.nodelist = nodelist[0].value - self.memory = memory[0].value - elif "queue" in [x.key for x in self.parameters] and application_name in self.slurm_dict: - # Temporary check for old workunit previously run with SGE - self.partition = self.slurm_dict[application_name]["partition"] - self.nodelist = self.slurm_dict[application_name]["nodelist"] - self.memory = self.slurm_dict[application_name]["memory"] - else: - pass - - print(f"partition={self.partition}") - print(f"nodelist={self.nodelist}") - print(f"memory={self.memory}") - print("__init__ DONE") - - def submit_gridengine(self, script="/tmp/runme.bash", arguments=""): - GE = gridengine.GridEngine(user=self.user, queue=self.queue, GRIDENGINEROOT=self.SCHEDULEROOT) - - print(script) - print(type(script)) - resQsub = GE.qsub(script=script, arguments=arguments) - - self.B.logger(f"{resQsub}") - - def submit_slurm(self, script="/tmp/runme.bash", arguments=""): - SL = slurm.SLURM(user=self.user, SLURMROOT=self.SCHEDULEROOT) - - print(script) - print(type(script)) - resSbatch = SL.sbatch(script=script, arguments=arguments) - - self.B.logger(f"{resSbatch}") - - def compose_bash_script(self, configuration=None, configuration_parser=lambda x: yaml.safe_load(x)): + self.user = user + self._script_dir = script_dir + + self.external_job = ExternalJob.find(id=externaljobid, client=client) + self.workunit = self.external_job.workunit + self.parameters = self.workunit.parameter_values + self.application = self.workunit.application + + default_config = self.slurm_dict.get(self.application["name"], {}) + self.partition = self.parameters.get("partition", default_config.get("partition")) + self.nodelist = self.parameters.get("nodelist", default_config.get("nodelist")) + self.memory = self.parameters.get("memory", default_config.get("memory")) + + logger.debug(f"partition={self.partition}") + logger.debug(f"nodelist={self.nodelist}") + logger.debug(f"memory={self.memory}") + logger.debug("__init__ DONE") + + def submit_slurm(self, script: str = "/tmp/runme.bash") -> None: + slurm = SLURM(slurm_root=self.scheduleroot) + logger.debug(script) + logger.debug(type(script)) + res_slurm_batch = slurm.sbatch(script=script) + logger.debug(f"{res_slurm_batch}") + + def compose_bash_script(self, configuration=None, configuration_parser=lambda x: yaml.safe_load(x)) -> str: """ composes the bash script which is executed by the submitter (sun grid engine). - as argument it takes a configuration file, e.g., yaml, xml, json, or whatsoever, and a parser function. + as an argument it takes a configuration file, e.g., yaml, xml, json, or whatsoever, and a parser function. it returns a str object containing the code. 
@@ -122,11 +82,7 @@ def compose_bash_script(self, configuration=None, configuration_parser=lambda x: """ # assert isinstance(configuration, str) - - try: - config = configuration_parser(configuration) - except: - raise ValueError("error: parsing configuration content failed.") + config = configuration_parser(configuration) _cmd_template = """#!/bin/bash # Maria d'Errico @@ -135,33 +91,33 @@ def compose_bash_script(self, configuration=None, configuration_parser=lambda x: # 2020-09-29 # https://GitHub.com/fgcz/bfabricPy/ # Slurm -#SBATCH --partition={0} -#SBATCH --nodelist={11} +#SBATCH --partition={partition} +#SBATCH --nodelist={nodelist} #SBATCH -n 1 #SBATCH -N 1 #SBATCH --cpus-per-task=1 -#SBATCH --mem-per-cpu={12} -#SBATCH -e {1} -#SBATCH -o {2} -#SBATCH --job-name=WU{10} +#SBATCH --mem-per-cpu={memory} +#SBATCH -e {stderr_url} +#SBATCH -o {stdout_url} +#SBATCH --job-name=WU{workunit_id} #SBATCH --workdir=/home/bfabric #SBATCH --export=ALL,HOME=/home/bfabric # Grid Engine Parameters -#$ -q {0}&{11} -#$ -e {1} -#$ -o {2} +#$ -q {partition}&{nodelist} +#$ -e {stderr_url} +#$ -o {stdout_url} set -e set -o pipefail export EMAIL="{job_notification_emails}" -export EXTERNALJOB_ID={3} -export RESSOURCEID_OUTPUT={4} -export RESSOURCEID_STDOUT_STDERR="{5} {6}" -export OUTPUT="{7}" -export WORKUNIT_ID="{10}" +export EXTERNALJOB_ID={external_job_id} +export RESSOURCEID_OUTPUT={resource_id_output} +export RESSOURCEID_STDOUT_STDERR="{resource_id_stderr} {resource_id_stdout}" +export OUTPUT="{output_list}" +export WORKUNIT_ID="{workunit_id}" STAMP=`/bin/date +%Y%m%d%H%M`.$$.$JOB_ID TEMPDIR="/home/bfabric/prx" @@ -175,14 +131,14 @@ def compose_bash_script(self, configuration=None, configuration_parser=lambda x: if [ $? -eq 1 ]; then - echo "writting to output url failed!"; + echo "writing to output url failed!"; exit 1; fi # job configuration set by B-Fabrics wrapper_creator executable # application parameter/configuration cat > $TEMPDIR/config_WU$WORKUNIT_ID.yaml < None: """ implements the default submitter the function fetches the yaml base64 configuration file linked to the external job id out of the B-Fabric - system. Since the file can not be stagged to the LRMS as argument, we copy the yaml file into the bash script + system. Since the file can not be staged to the LRMS as argument, we copy the yaml file into the bash script and stage it on execution the application. TODO(cp): create the output url before the application is started. 
return None """ - - # foreach (executable in external job): - for executable in self.B.get_executable_of_externaljobid(): - self.B.logger(f"executable = {executable}") - - try: - content = base64.b64decode(executable.base64.encode()).decode() - except: - raise ValueError("error: decoding executable.base64 failed.") - - print(content) + executables = Executable.find_by({"workunitid": self.workunit.id}, client=self._client).values() + for executable in executables: + if not executable["base64"]: + continue + + logger.debug(f"executable = {executable}") + content = base64.b64decode(executable["base64"].encode()).decode() + logger.debug(content) _cmd_template = self.compose_bash_script( configuration=content, configuration_parser=lambda x: yaml.safe_load(x) ) - _bash_script_filename = f"/home/bfabric/prx/workunitid-{self.B.get_workunitid_of_externaljob()}_externaljobid-{self.B.externaljobid}_executableid-{executable._id}.bash" + bash_script_file = Path( + self._script_dir, + f"workunitid-{self.workunit.id}_externaljobid-{self.external_job.id}" + f"_executableid-{self.external_job.executable.id}.bash", + ) - with open(_bash_script_filename, "w") as f: - f.write(_cmd_template) + bash_script_file.write_text(_cmd_template) if self.scheduler == "GridEngine": - self.submit_gridengine(_bash_script_filename) + raise NotImplementedError + # self.submit_gridengine(bash_script_file) else: - self.submit_slurm(_bash_script_filename) - self.execfilelist.append(_bash_script_filename) + self.submit_slurm(str(bash_script_file)) + self._executable_file_list.append(str(bash_script_file)) - res = self.B.save_object(endpoint="externaljob", obj={"id": self.B.externaljobid, "status": "done"}) + self._client.save("externaljob", {"id": self.external_job.id, "status": "done"}) - def get_job_script(self): - return self.execfilelist + def get_job_script(self) -> list[str]: + return self._executable_file_list diff --git a/src/bfabric/wrapper_creator/bfabric_wrapper_creator.py b/src/bfabric/wrapper_creator/bfabric_wrapper_creator.py index 104f8611..e0e59959 100644 --- a/src/bfabric/wrapper_creator/bfabric_wrapper_creator.py +++ b/src/bfabric/wrapper_creator/bfabric_wrapper_creator.py @@ -1,20 +1,16 @@ from __future__ import annotations import base64 -import datetime -import json from collections import defaultdict from functools import cached_property from pathlib import Path from typing import Any, Literal -import yaml +from loguru import logger from bfabric import Bfabric -from bfabric.bfabric_legacy import bfabricEncoder from bfabric.entities import Workunit, ExternalJob, Application, Resource, Storage, Order, Project -from bfabric.experimental.app_interface.workunit.definition import WorkunitDefinition -from bfabric.wrapper_creator.bfabric_external_job import BfabricExternalJob +from bfabric.experimental.workunit_definition import WorkunitDefinition class BfabricWrapperCreator: @@ -45,6 +41,7 @@ def _log_storage(self) -> Storage: def create_output_resource(self) -> Resource: # Since we use the id of the output resource in the path, we have to save it twice. 
+ logger.info("Creating output resource") n_input_resource = len(self._workunit.input_resources) resource_id = self._client.save( "resource", @@ -62,10 +59,12 @@ def create_output_resource(self) -> Resource: relative_path = str(output_folder / output_filename) # Save the path + logger.info("Saving correct path") result = self._client.save("resource", {"id": resource_id, "relativepath": relative_path}) return Resource(result[0]) def create_log_resource(self, variant: Literal["out", "err"], output_resource: Resource) -> Resource: + logger.info("Creating log resource") result = self._client.save( "resource", { @@ -78,6 +77,7 @@ def create_log_resource(self, variant: Literal["out", "err"], output_resource: R return Resource(result[0]) def get_application_section(self, output_resource: Resource) -> dict[str, Any]: + logger.info("Creating application section") output_url = f"bfabric@{self._application.storage.data_dict['host']}:{self._application.storage.data_dict['basepath']}{output_resource.data_dict['relativepath']}" inputs = defaultdict(list) for resource in Resource.find_all(self.workunit_definition.execution.resources, client=self._client).values(): @@ -94,6 +94,7 @@ def get_application_section(self, output_resource: Resource) -> dict[str, Any]: def get_job_configuration_section( self, output_resource: Resource, stdout_resource: Resource, stderr_resource: Resource ) -> dict[str, Any]: + logger.info("Creating job configuration section") log_resource = {} for name, resource in [("stdout", stdout_resource), ("stderr", stderr_resource)]: @@ -143,7 +144,8 @@ def _fasta_sequence(self) -> str: else: return "" - def write_results(self, config_serialized: str) -> None: + def write_results(self, config_serialized: str) -> tuple[dict[str, Any], dict[str, Any]]: + logger.info("Saving executable") yaml_workunit_executable = self._client.save( "executable", { @@ -155,6 +157,7 @@ def write_results(self, config_serialized: str) -> None: "version": "10", }, )[0] + logger.info("Saving external job") yaml_workunit_externaljob = self._client.save( "externaljob", { @@ -163,292 +166,14 @@ def write_results(self, config_serialized: str) -> None: "executableid": yaml_workunit_executable["id"], "action": "WORKUNIT", }, - ) + )[0] # TODO now i am a bit confused, the external_job_id that is added to the .yml file is not the original one # but rather the one from the yaml_workunit_externaljob. I am not sure if we need this as it makes the # code here a lot more complex - print(yaml_workunit_externaljob) + logger.info(yaml_workunit_externaljob) + logger.info("Setting external job status to 'done'") self._client.save("externaljob", {"id": self._external_job_id, "status": "done"}) - -class BfabricWrapperCreatorOld(BfabricExternalJob): - """ - the class is used for the wrapper_creator which is executed by the bfabtic system - (non batch) so each resource is processed seperate - """ - - (externaljobid_submitter, workunit_executableid) = (None, None) - - def get_externaljobid_yaml_workunit(self): - return self.externaljobid_yaml_workunit - - def get_executableid(self): - return self.workunit_executableid - - def write_yaml(self, data_serializer=lambda x: yaml.dump(x, default_flow_style=False, encoding=None)): - """ - This method writes all related parameters into a yaml file which is than upload as base64 encoded - file into the b-fabric system. - - if the method does not excepted at the end it reports also the status of the external_job. 
- - TODO(cp): make this function more generic so that it can also export xml, json, yaml, ... - """ - - # Inherits all parameters of the application executable out of B-Fabric to create an executable script - workunitid = self.get_workunitid_of_externaljob() - - if workunitid is None: - raise ValueError("no workunit available for the given externaljobid.") - - workunit = self.read_object(endpoint="workunit", obj={"id": workunitid})[0] - if workunit is None: - raise ValueError("ERROR: no workunit available for the given externaljobid.") - - assert isinstance(workunit._id, int) - - application = self.read_object("application", obj={"id": workunit.application._id})[0] - # TODO(cp): rename to application_execuatbel - workunit_executable = self.read_object("executable", obj={"id": workunit.applicationexecutable._id})[0] - try: - self.workunit_executableid = workunit_executable._id - except: - self.workunit_executableid = None - - # Get container details - container = workunit.container - fastasequence = "" - if container._classname == "order": - order = self.read_object("order", obj={"id": container._id})[0] - order_id = order._id - if "project" in order: # noqa - project_id = order.project._id - else: - project_id = None - if "fastasequence" in order: - fastasequence = "\n".join([x.strip() for x in str(order.fastasequence).split("\r")]) - else: - order_id = None - project_id = container._id - - today = datetime.date.today() - - # merge all information into the executable script - _output_storage = self.read_object("storage", obj={"id": application.storage._id})[0] - - _output_relative_path = "p{0}/bfabric/{1}/{2}/{3}/workunit_{4}/".format( # noqa - container._id, - application.technology.replace(" ", "_"), - application.name.replace(" ", "_"), - today.strftime("%Y/%Y-%m/%Y-%m-%d/"), - workunitid, - ) - - # Setup the log_storage to SlurmLog with id 13 - _log_storage = self.read_object("storage", obj={"id": 13})[0] - - # _cmd_applicationList = [workunit_executable.program] - - application_parameter = {} - - if getattr(workunit, "parameter", None) is not None: - for para in workunit.parameter: - parameter = self.read_object("parameter", obj={"id": para._id}) - if parameter: - for p in parameter: - try: - application_parameter[f"{p.key}"] = f"{p.value}" - except: - application_parameter[f"{p.key}"] = "" - - try: - input_resources = [x._id for x in workunit.inputresource] - input_resources = [self.read_object(endpoint="resource", obj={"id": x})[0] for x in input_resources] - except: - print("no input resources found. 
continue with empty list.") - input_resources = [] - - # query all urls and ids of the input resources - resource_urls = dict() - resource_ids = dict() - - for resource_iterator in input_resources: - try: - _appication_id = self.read_object(endpoint="workunit", obj={"id": resource_iterator.workunit._id})[ - 0 - ].application._id - - _application_name = f"{self.read_object('application', obj={'id': _appication_id})[0].name}" - - _storage = self.read_object("storage", {"id": resource_iterator.storage._id})[0] - - _inputUrl = f"bfabric@{_storage.host}:/{_storage.basepath}/{resource_iterator.relativepath}" - - if _application_name not in resource_urls: - resource_urls[_application_name] = [] - resource_ids[_application_name] = [] - - resource_urls[_application_name].append(_inputUrl) - - sample_id = self.get_sampleid(int(resource_iterator._id)) - - _resource_sample = { - "resource_id": int(resource_iterator._id), - "resource_url": f"{self.config.base_url}/userlab/show-resource.html?id={resource_iterator._id}", - } - - if sample_id is not None: - _resource_sample["sample_id"] = int(sample_id) - _resource_sample["sample_url"] = f"{self.config.base_url}/userlab/show-sample.html?id={sample_id}" - - resource_ids[_application_name].append(_resource_sample) - except: - print("resource_iterator failed. continue ...") - pass - - # create resources for output, stderr, stdout - _ressource_output = self.save_object( - "resource", - { - "name": f"{application.name} {len(input_resources)} - resource", - "workunitid": workunit._id, - "storageid": int(application.storage._id), - "relativepath": _output_relative_path, - }, - )[0] - - print(_ressource_output) - _output_filename = f"{_ressource_output._id}.{application.outputfileformat}" - # we want to include the resource._id into the filename - _ressource_output = self.save_object( - "resource", - { - "id": int(_ressource_output._id), - "relativepath": f"{_output_relative_path}/{_output_filename}", - }, - )[0] - - print(_ressource_output) - _resource_stderr = self.save_object( - "resource", - { - "name": "slurm_stderr", - "workunitid": int(workunit._id), - "storageid": _log_storage._id, - "relativepath": f"/workunitid-{workunit._id}_resourceid-{_ressource_output._id}.err", - }, - )[0] - - _resource_stdout = self.save_object( - "resource", - { - "name": "slurm_stdout", - "workunitid": workunit._id, - "storageid": _log_storage._id, - "relativepath": f"/workunitid-{workunit._id}_resourceid-{_ressource_output._id}.out", - }, - )[0] - - # Creates the workunit executable - # The config includes the externaljobid: the yaml_workunit_externaljob has to be created before it. - # The yaml_workunit_externaljob cannot be created without specifying an executableid: - # a yaml_workunit_executable is thus created before the config definition in order to provide - # the correct executableid to the yaml_workunit_externaljob. - # However this yaml_workunit_executable has to be updated later to include 'base64': base64.b64encode(config_serialized.encode()).decode() - yaml_workunit_executable = self.save_object( - "executable", - { - "name": "job configuration (executable) in YAML", - "context": "WORKUNIT", - "workunitid": workunit._id, - "description": "This is a job configuration as YAML base64 encoded. 
It is configured to be executed by the B-Fabric yaml submitter.", - }, - )[0] - print(yaml_workunit_executable) - - yaml_workunit_externaljob = self.save_object( - "externaljob", - { - "workunitid": workunit._id, - "status": "new", - "executableid": yaml_workunit_executable._id, - "action": "WORKUNIT", - }, - )[0] - print(yaml_workunit_externaljob) - assert isinstance(yaml_workunit_externaljob._id, int) - self.externaljobid_yaml_workunit = int(yaml_workunit_externaljob._id) - print(f"XXXXXXX self.externaljobid_yaml_workunit ={self.externaljobid_yaml_workunit} XXXXXXX") - - _output_url = ( - f"bfabric@{_output_storage.host}:{_output_storage.basepath}{_output_relative_path}/{_output_filename}" - ) - - try: - query_obj = {"id": workunit.inputdataset._id} - inputdataset = self.read_object(endpoint="dataset", obj=query_obj)[0] - inputdataset_json = json.dumps(inputdataset, cls=bfabricEncoder, sort_keys=True, indent=2) - inputdataset = json.loads(inputdataset_json) - except: - inputdataset = None - - # Compose configuration structure - config = { - "job_configuration": { - "executable": f"{workunit_executable.program}", - "inputdataset": inputdataset, - "input": resource_ids, - "output": { - "protocol": "scp", - "resource_id": int(_ressource_output._id), - "ssh_args": "-o StrictHostKeyChecking=no -2 -l bfabric -x", - }, - "stderr": { - "protocol": "file", - "resource_id": int(_resource_stderr._id), - "url": f"{_log_storage.basepath}/workunitid-{workunit._id}_resourceid-{_ressource_output._id}.err", - }, - "stdout": { - "protocol": "file", - "resource_id": int(_resource_stdout._id), - "url": f"{_log_storage.basepath}/workunitid-{workunit._id}_resourceid-{_ressource_output._id}.out", - }, - "workunit_id": int(workunit._id), - "workunit_createdby": str(workunit.createdby), - "workunit_url": f"{self.config.base_url}/userlab/show-workunit.html?workunitId={workunit._id}", - "external_job_id": int(yaml_workunit_externaljob._id), - "order_id": order_id, - "project_id": project_id, - "fastasequence": fastasequence, - }, - "application": { - "protocol": "scp", - "parameters": application_parameter, - "input": resource_urls, - "output": [_output_url], - }, - } - - config_serialized = data_serializer(config) - print(config_serialized) - - yaml_workunit_executable = self.save_object( - "executable", - { - "id": yaml_workunit_executable._id, - "base64": base64.b64encode(config_serialized.encode()).decode(), - "version": f"{10}", - }, - )[0] - print(yaml_workunit_executable) - - # The WrapperCreator executable is successful, and the status of the its external job is set to done, - # which triggers B-Fabric to create an external job for the submitter executable. - - wrapper_creator_externaljob = self.save_object( - endpoint="externaljob", obj={"id": self.externaljobid, "status": "done"} - ) - - print(f"\n\nquery_counter={self.query_counter}") + return yaml_workunit_executable, yaml_workunit_externaljob diff --git a/src/bfabric/wrapper_creator/slurm.py b/src/bfabric/wrapper_creator/slurm.py old mode 100755 new mode 100644 index f7683115..2315b492 --- a/src/bfabric/wrapper_creator/slurm.py +++ b/src/bfabric/wrapper_creator/slurm.py @@ -1,12 +1,13 @@ -#! 
/usr/bin/env python +from __future__ import annotations + +from loguru import logger + """ Interface to the SLURM (Simple Linux Utility for Resources Management) resource manager and job scheduler 2020-09-28 Maria d'Errico Christian Panse - -$HeadURL: http://fgcz-svn.uzh.ch/repos/scripts/trunk/linux/bfabric/apps/python/bfabric/slurm.py $ """ # Copyright (C) 2011, 2012 ETH Zurich and University of Zurich. All rights reserved. @@ -29,46 +30,39 @@ # limitations under the License. # -__docformat__ = "reStructuredText" -# __version__ = '$Revision: 2463 $' - +from pathlib import Path import os import subprocess class SLURM: - """ - interface to Slurm sbatch - """ - - def __init__(self, user="*", SLURMROOT="/usr/"): - """ - Set up parameters for querying Slurm. + """Wrapper for SLURM, providing a Python interface to `sbatch`. - SLURMROOT is essential. - """ - - self.user = user - self.sbatchbin = f"{SLURMROOT}/bin/sbatch" + The `slurm_root` variable will be passed as `SLURMROOT` to the environment, when submitting the script, and is an + important parameter which needs to be set correctly for our scripts to function properly. + """ - os.environ["SLURM_ROOT"] = SLURMROOT + def __init__(self, slurm_root: str | Path = "/usr/") -> None: + self._slurm_root = Path(slurm_root) + self._sbatch_bin = self._slurm_root / "bin/sbatch" - def sbatch(self, script, arguments=""): - """ - todo: pass stderr and stdout file location as argument + def sbatch(self, script: str | Path) -> tuple[str, str] | None: + """Submits the script to SLURM using `sbatch`. + If successful, returns a tuple with the stdout and stderr of the submission. """ - sbatch_cmd = [self.sbatchbin, script, " ".join(arguments)] - - if not os.path.isfile(self.sbatchbin): - print(f"{self.sbatchbin} can not be found.") + script = Path(script) + if not script.is_file(): + logger.error(f"Script not found: {script}") return - - if not os.path.isfile(script): - print(f"'{script}' - no such file.") + if not self._sbatch_bin.is_file(): + logger.error(f"sbatch binary not found: {self._sbatch_bin}") return - sbatch_process = subprocess.Popen(sbatch_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False) - result = [x.decode("utf-8") for x in sbatch_process.communicate()] - - return "".join(result) + env = os.environ | {"SLURMROOT": self._slurm_root} + result = subprocess.run( + [self._sbatch_bin, script], env=env, check=True, shell=False, capture_output=True, encoding="utf-8" + ) + # TODO the code initially had a TODO to write these two to a file, in general I think the logs of the squeue + # are currently not written to a file at all. + return result.stdout, result.stderr diff --git a/src/bfabric_scripts/bfabric_executable_submitter_slurm.py b/src/bfabric_scripts/bfabric_executable_submitter_slurm.py index 1f4bf375..ae3208e5 100755 --- a/src/bfabric_scripts/bfabric_executable_submitter_slurm.py +++ b/src/bfabric_scripts/bfabric_executable_submitter_slurm.py @@ -4,6 +4,11 @@ Submitter for B-Fabric """ +from argparse import ArgumentParser + +from bfabric import Bfabric +from bfabric.wrapper_creator.bfabric_submitter import BfabricSubmitter + # Copyright (C) 2014,2015 Functional Genomics Center Zurich ETHZ|UZH. All rights reserved. 
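For reference, a minimal usage sketch of the rewritten `SLURM` wrapper above; the script path used here is only an assumption:

```python
from pathlib import Path

from bfabric.wrapper_creator.slurm import SLURM

# slurm_root is exported as SLURMROOT in the submission environment and must contain bin/sbatch.
slurm = SLURM(slurm_root="/usr/")
result = slurm.sbatch(script=Path("/home/bfabric/submit_workunit.bash"))
if result is None:
    # Either the script or the sbatch binary was not found; the error has been logged.
    raise SystemExit(1)
stdout, stderr = result
print(stdout)
```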
# Modified to submit to the Slurm scheduler on 2020-09-28 # @@ -31,35 +36,13 @@ """ -# import os -# import sys -from optparse import OptionParser -from bfabric import BfabricSubmitter - - def main() -> None: - parser = OptionParser(usage="usage: %prog -j ", version="%prog 1.0") - - parser.add_option( - "-j", - "--externaljobid", - type="int", - action="store", - dest="externaljobid", - default=None, - help="external job id is required.", - ) - - (options, args) = parser.parse_args() - - if not options.externaljobid: - parser.error("option '-j' is required.") - - bfapp = BfabricSubmitter(externaljobid=options.externaljobid, SCHEDULEROOT="/usr/", scheduler="Slurm") - + parser = ArgumentParser(description="Submitter for B-Fabric") + parser.add_argument("-j", "--externaljobid", type=int, required=True) + args = parser.parse_args() + client = Bfabric.from_config() + bfapp = BfabricSubmitter(client=client, externaljobid=args.externaljobid, scheduleroot="/usr/", scheduler="Slurm") bfapp.submitter_yaml() - # TODO(cp): fix that - # print(bfapp.query_counter) if __name__ == "__main__": diff --git a/src/bfabric_scripts/bfabric_list_workunit_parameters.py b/src/bfabric_scripts/bfabric_list_workunit_parameters.py index 88439847..b7f17e54 100644 --- a/src/bfabric_scripts/bfabric_list_workunit_parameters.py +++ b/src/bfabric_scripts/bfabric_list_workunit_parameters.py @@ -1,11 +1,12 @@ import argparse import json -import sys import polars as pl import rich from bfabric import Bfabric +from bfabric.cli_formatting import setup_script_logging +from bfabric.experimental import MultiQuery def bfabric_list_workunit_parameters(client: Bfabric, application_id: int, max_workunits: int, format: str) -> None: @@ -74,16 +75,13 @@ def print_results(format: str, merged_result: pl.DataFrame) -> None: def get_parameter_table(client: Bfabric, workunits_table_explode: pl.DataFrame) -> pl.DataFrame: """Returns a wide format table for the specified parameters, with the key `workunit_id` indicating the source.""" # load the parameters table - collect = [] - for i_frame, frame in enumerate(workunits_table_explode.iter_slices(100)): - print( - f"-- Reading parameters chunk {i_frame + 1} of {len(workunits_table_explode) // 100 + 1}", file=sys.stderr - ) - chunk = ( - client.read("parameter", {"id": frame["parameter_id"].to_list()}).to_polars().rename({"id": "parameter_id"}) - ) - collect.append(chunk) - parameter_table_full = pl.concat(collect, how="align")[["parameter_id", "key", "value"]] + collect = MultiQuery(client=client).read_multi( + endpoint="parameter", + obj={}, + multi_query_key="id", + multi_query_vals=workunits_table_explode["parameter_id"].to_list(), + ) + parameter_table_full = collect.to_polars().rename({"id": "parameter_id"})[["parameter_id", "key", "value"]] # add workunit id to parameter table parameter_table_full = parameter_table_full.join( workunits_table_explode[["workunit_id", "parameter_id"]], on="parameter_id", how="left" @@ -94,6 +92,7 @@ def get_parameter_table(client: Bfabric, workunits_table_explode: pl.DataFrame) def main() -> None: """Parses command line arguments and calls `bfabric_list_workunit_parameters`.""" + setup_script_logging() client = Bfabric.from_config() parser = argparse.ArgumentParser() parser.add_argument("application_id", type=int, help="The application ID to list the workunit parameters for.") diff --git a/src/bfabric_scripts/bfabric_save_resource_description.py b/src/bfabric_scripts/bfabric_save_resource_description.py new file mode 100644 index 00000000..23e54903 --- /dev/null +++ 
b/src/bfabric_scripts/bfabric_save_resource_description.py @@ -0,0 +1,28 @@ +import argparse +from pathlib import Path + +from rich.pretty import pprint + +from bfabric import Bfabric +from bfabric.cli_formatting import setup_script_logging + + +def save_resource_description(client: Bfabric, id: int, description_file: Path) -> None: + description = description_file.read_text() + obj = {"id": id, "description": description} + response = client.save(endpoint="resource", obj=obj) + pprint(response[0], indent_guides=False) + + +def main() -> None: + setup_script_logging() + parser = argparse.ArgumentParser() + parser.add_argument("id", type=int) + parser.add_argument("description_file", type=Path) + client = Bfabric.from_config() + args = parser.parse_args() + save_resource_description(client=client, **vars(args)) + + +if __name__ == "__main__": + main() diff --git a/src/bfabric_scripts/bfabric_slurm_queue_status.py b/src/bfabric_scripts/bfabric_slurm_queue_status.py index a9f57159..6df0ae06 100644 --- a/src/bfabric_scripts/bfabric_slurm_queue_status.py +++ b/src/bfabric_scripts/bfabric_slurm_queue_status.py @@ -5,13 +5,13 @@ import json import shlex import subprocess -import sys import polars as pl +import sys from loguru import logger from bfabric import Bfabric -from bfabric.entities import Workunit +from bfabric.entities import Workunit, Application def get_slurm_jobs(partition: str, ssh_host: str | None) -> pl.DataFrame: @@ -33,12 +33,26 @@ def get_slurm_jobs(partition: str, ssh_host: str | None) -> pl.DataFrame: return df.with_columns(workunit_id=pl.when(string_id_expr.is_not_null()).then(string_id_expr.cast(int))) -def get_workunit_status(client: Bfabric, workunit_ids: list[int]) -> dict[int, str]: - """Returns the status of the workunits with the specified ids, by consoluting the bfabric API. +def get_workunit_infos(client: Bfabric, workunit_ids: list[int]) -> list[dict[str, str]]: + """Retrieves information about the workunits with the specified ids. If a workunit was deleted, but it is in the slurm queue, it will be considered a zombie. """ + # Find the workunits which actually exist. workunits = Workunit.find_all(ids=workunit_ids, client=client) - return {id: workunits[id].data_dict["status"] if id in workunits else "ZOMBIE" for id in workunit_ids} + + # Retrieve application id -> name mapping. 
+ app_ids = {wu["application"]["id"] for wu in workunits.values()} + apps = Application.find_all(ids=list(app_ids), client=client) + app_names = {app["id"]: app["name"] for app in apps.values()} + + return [ + { + "workunit_id": id, + "status": workunits[id].data_dict["status"] if id in workunits else "ZOMBIE", + "application_name": app_names[workunits[id]["application"]["id"]] if id in workunits else "N/A", + } + for id in workunit_ids + ] def find_zombie_jobs(client: Bfabric, partition: str, ssh_host: str | None) -> pl.DataFrame: @@ -46,14 +60,14 @@ def find_zombie_jobs(client: Bfabric, partition: str, ssh_host: str | None) -> p slurm_jobs = get_slurm_jobs(partition=partition, ssh_host=ssh_host) if slurm_jobs.is_empty(): return pl.DataFrame() - workunit_status = get_workunit_status( - client=client, workunit_ids=slurm_jobs["workunit_id"].drop_nulls().cast(int).to_list() + workunit_info_table = pl.DataFrame( + get_workunit_infos(client=client, workunit_ids=slurm_jobs["workunit_id"].drop_nulls().cast(int).to_list()) ) - workunit_status_table = pl.from_dict(dict(workunit_id=workunit_status.keys(), status=workunit_status.values())) - logger.info(slurm_jobs.join(workunit_status_table, on="workunit_id", how="left").sort("workunit_id")) - logger.info(f"Active jobs: {workunit_status_table.height}") - logger.info(f"Found {workunit_status_table.filter(pl.col('status') == 'ZOMBIE').height} zombie jobs.") - return workunit_status_table.filter(pl.col("status") == "ZOMBIE") + pl.Config.set_tbl_rows(100) + logger.info(slurm_jobs.join(workunit_info_table, on="workunit_id", how="left").sort("workunit_id")) + logger.info(f"Active jobs: {workunit_info_table.height}") + logger.info(f"Found {workunit_info_table.filter(pl.col('status') == 'ZOMBIE').height} zombie jobs.") + return workunit_info_table.filter(pl.col("status") == "ZOMBIE") def main() -> None: diff --git a/src/bfabric_scripts/bfabric_upload_submitter_executable.py b/src/bfabric_scripts/bfabric_upload_submitter_executable.py index ba872875..052a4d77 100755 --- a/src/bfabric_scripts/bfabric_upload_submitter_executable.py +++ b/src/bfabric_scripts/bfabric_upload_submitter_executable.py @@ -17,16 +17,6 @@ # Licensed under GPL version 3 # # -# Usage: bfabric_upload_submitter_executable.py [-h] filename {slurm,gridengine} -# -# Arguments for new submitter executable. For more details run: -# ./bfabric_upload_submitter_executable.py --help -# -# positional arguments: -# filename Bash executable of the submitter -# {slurm,gridengine} Valid engines for job handling are: slurm, gridengine -# -# # Example of use: # # For bfabric.__version__ < 0.10.22 @@ -42,31 +32,46 @@ # # ./bfabric_upload_submitter_executable.py bfabric_executable_submitter_functionalTest.py slurm --name "Dummy_-_yaml___Slurm_executable" --description "test new submitter's parameters" # +from __future__ import annotations import argparse import base64 +from pathlib import Path import yaml from bfabric import Bfabric - - -def main_upload_submitter_executable(options) -> None: - executableFileName = options.filename - engine = options.engine - - client = Bfabric.from_config() - - with open(executableFileName) as f: - executable = f.read() +from bfabric.cli_formatting import setup_script_logging + + +def slurm_parameters() -> list[dict[str, str]]: + parameters = [{"modifiable": "true", "required": "true", "type": "STRING"} for _ in range(3)] + parameters[0]["description"] = "Which Slurm partition should be used." 
+ parameters[0]["enumeration"] = ["prx"] + parameters[0]["key"] = "partition" + parameters[0]["label"] = "partition" + parameters[0]["value"] = "prx" + parameters[1]["description"] = "Which Slurm nodelist should be used." + parameters[1]["enumeration"] = ["fgcz-r-033"] + parameters[1]["key"] = "nodelist" + parameters[1]["label"] = "nodelist" + parameters[1]["value"] = "fgcz-r-[035,028]" + parameters[2]["description"] = "Which Slurm memory should be used." + parameters[2]["enumeration"] = ["10G", "50G", "128G", "256G", "512G", "960G"] + parameters[2]["key"] = "memory" + parameters[2]["label"] = "memory" + parameters[2]["value"] = "10G" + return parameters + + +def main_upload_submitter_executable( + client: Bfabric, filename: Path, engine: str, name: str | None, description: str | None +) -> None: + executable = filename.read_text() attr = { "context": "SUBMITTER", - "parameter": [ - {"modifiable": "true", "required": "true", "type": "STRING"}, - {"modifiable": "true", "required": "true", "type": "STRING"}, - {"modifiable": "true", "required": "true", "type": "STRING"}, - ], + "parameter": [], "masterexecutableid": 11871, "status": "available", "enabled": "true", @@ -75,49 +80,17 @@ def main_upload_submitter_executable(options) -> None: } if engine == "slurm": - attr["name"] = "yaml / Slurm executable" - attr["parameter"][0]["description"] = "Which Slurm partition should be used." - attr["parameter"][0]["enumeration"] = ["prx", "maxquant", "scaffold", "mascot"] - attr["parameter"][0]["key"] = "partition" - attr["parameter"][0]["label"] = "partition" - attr["parameter"][0]["value"] = "prx" - attr["parameter"][1]["description"] = "Which Slurm nodelist should be used." - attr["parameter"][1]["enumeration"] = [ - "fgcz-r-[035,028]", - "fgcz-r-035", - "fgcz-r-033", - "fgcz-r-028", - "fgcz-r-018", - ] - attr["parameter"][1]["key"] = "nodelist" - attr["parameter"][1]["label"] = "nodelist" - attr["parameter"][1]["value"] = "fgcz-r-[035,028]" - attr["parameter"][2]["description"] = "Which Slurm memory should be used." - attr["parameter"][2]["enumeration"] = ["10G", "50G", "128G", "256G", "512G", "960G"] - attr["parameter"][2]["key"] = "memory" - attr["parameter"][2]["label"] = "memory" - attr["parameter"][2]["value"] = "10G" - attr["version"] = 1.02 - attr["description"] = "Stage the yaml config file to application using Slurm." - elif engine == "gridengine": - attr["name"] = "yaml / Grid Engine executable" - attr["parameter"][0]["description"] = "Which Grid Engine partition should be used." - attr["parameter"][0]["enumeration"] = "PRX" - attr["parameter"][0]["key"] = "partition" - attr["parameter"][0]["label"] = "partition" - attr["parameter"][0]["value"] = "PRX" - attr["parameter"][1]["description"] = "Which Grid Engine node should be used." - attr["parameter"][1]["enumeration"] = ["fgcz-r-033", "fgcz-r-028", "fgcz-r-018"] - attr["parameter"][1]["key"] = "nodelist" - attr["parameter"][1]["label"] = "nodelist" - attr["parameter"][1]["value"] = "fgcz-r-028" - attr["version"] = 1.00 - attr["description"] = "Stage the yaml config file to an application using Grid Engine." - - if options.name: - attr["name"] = options.name - if options.description: - attr["description"] = options.description + name = name or "yaml / Slurm executable" + description = description or "Submitter executable for the bfabric functional test using Slurm." 
+ attr["version"] = "1.03" + attr["parameter"] = slurm_parameters() + else: + raise NotImplementedError + + if name: + attr["name"] = name + if description: + attr["description"] = description res = client.save("executable", attr) print(yaml.dump(res)) @@ -125,18 +98,20 @@ def main_upload_submitter_executable(options) -> None: def main() -> None: """Parses command line arguments and calls `main_upload_submitter_executable`.""" + setup_script_logging() + client = Bfabric.from_config() parser = argparse.ArgumentParser() - parser.add_argument("filename", type=str, help="Bash executable of the submitter") + parser.add_argument("filename", type=Path, help="Bash executable of the submitter") parser.add_argument( "engine", type=str, - choices=["slurm", "gridengine"], + choices=["slurm"], help="Valid engines for job handling are: slurm, gridengine", ) parser.add_argument("--name", type=str, help="Name of the submitter", required=False) parser.add_argument("--description", type=str, help="Description about the submitter", required=False) options = parser.parse_args() - main(options) + main_upload_submitter_executable(client=client, **vars(options)) if __name__ == "__main__": diff --git a/src/bfabric_scripts/py.typed b/src/bfabric_scripts/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/tests/app_runner/__init__.py b/tests/app_runner/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/app_runner/specs/__init__.py b/tests/app_runner/specs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/app_runner/specs/test_app_spec.py b/tests/app_runner/specs/test_app_spec.py new file mode 100644 index 00000000..d520f6e0 --- /dev/null +++ b/tests/app_runner/specs/test_app_spec.py @@ -0,0 +1,54 @@ +import pytest +import yaml + +from app_runner.specs.app_spec import AppSpec, CommandShell, CommandDocker, MountOptions, CommandsSpec + + +@pytest.fixture() +def parsed() -> AppSpec: + return AppSpec( + commands=CommandsSpec( + dispatch=CommandShell(command="dispatch"), + process=CommandDocker( + image="image", command="command", mounts=MountOptions(read_only=[("/host", "/container")]) + ), + collect=CommandShell(command="collect"), + ), + reuse_default_resource=True, + ) + + +@pytest.fixture() +def serialized() -> str: + return """commands: + collect: + command: collect + type: shell + dispatch: + command: dispatch + type: shell + process: + command: command + custom_args: [] + engine: docker + entrypoint: null + env: {} + image: image + mac_address: null + mounts: + read_only: + - - /host + - /container + share_bfabric_config: true + work_dir_target: null + writeable: [] + type: docker +reuse_default_resource: true""" + + +def test_serialize(parsed, serialized): + assert yaml.safe_dump(parsed.model_dump(mode="json")).strip() == serialized.strip() + + +def test_parse(parsed, serialized): + assert AppSpec.model_validate(yaml.safe_load(serialized)) == parsed diff --git a/tests/app_runner/specs/test_inputs_spec.py b/tests/app_runner/specs/test_inputs_spec.py new file mode 100644 index 00000000..ea802ddb --- /dev/null +++ b/tests/app_runner/specs/test_inputs_spec.py @@ -0,0 +1,43 @@ +import pytest +import yaml + +from app_runner.specs.inputs_spec import InputsSpec, ResourceSpec, DatasetSpec + + +@pytest.fixture() +def parsed() -> InputsSpec: + return InputsSpec( + inputs=[ + ResourceSpec( + id=1, + filename="filename", + check_checksum=True, + ), + DatasetSpec( + id=2, + filename="filename", + separator=",", + ), + ] + ) + + +@pytest.fixture() +def 
serialized() -> str: + return """inputs: +- check_checksum: true + filename: filename + id: 1 + type: bfabric_resource +- filename: filename + id: 2 + separator: ',' + type: bfabric_dataset""" + + +def test_serialize(parsed, serialized): + assert yaml.safe_dump(parsed.model_dump(mode="json")).strip() == serialized.strip() + + +def test_parse(parsed, serialized): + assert InputsSpec.model_validate(yaml.safe_load(serialized)) == parsed diff --git a/tests/app_runner/specs/test_outputs_spec.py b/tests/app_runner/specs/test_outputs_spec.py new file mode 100644 index 00000000..66bd13b2 --- /dev/null +++ b/tests/app_runner/specs/test_outputs_spec.py @@ -0,0 +1,47 @@ +import pytest +import yaml + +from app_runner.specs.outputs_spec import OutputsSpec, CopyResourceSpec, SaveDatasetSpec + + +@pytest.fixture() +def parsed() -> OutputsSpec: + return OutputsSpec( + outputs=[ + CopyResourceSpec( + local_path="local_path", + store_entry_path="store_entry_path", + store_folder_path=None, + update_existing="no", + protocol="scp", + ), + SaveDatasetSpec( + local_path="local_path", separator="separator", name=None, has_header=True, invalid_characters="" + ), + ] + ) + + +@pytest.fixture() +def serialized() -> str: + return """outputs: +- local_path: local_path + protocol: scp + store_entry_path: store_entry_path + store_folder_path: null + type: bfabric_copy_resource + update_existing: 'no' +- has_header: true + invalid_characters: '' + local_path: local_path + name: null + separator: separator + type: bfabric_dataset""" + + +def test_serialize(parsed, serialized): + assert yaml.safe_dump(parsed.model_dump(mode="json")).strip() == serialized.strip() + + +def test_parse(parsed, serialized): + assert OutputsSpec.model_validate(yaml.safe_load(serialized)) == parsed diff --git a/tests/bfabric/entities/core/test_entity.py b/tests/bfabric/entities/core/test_entity.py index 706214b1..7ed028fc 100644 --- a/tests/bfabric/entities/core/test_entity.py +++ b/tests/bfabric/entities/core/test_entity.py @@ -69,6 +69,13 @@ def test_find_all_when_not_all_found(mocker, mock_client) -> None: mock_client.read.assert_called_once_with("test_endpoint", obj={"id": [1, 5]}) +def test_find_all_when_empty_list(mock_client) -> None: + entities = Entity.find_all([], mock_client) + assert entities == {} + mock_client.read.assert_not_called() + mock_client.assert_not_called() + + def test_find_by_when_found(mocker, mock_client) -> None: mock_client.read.return_value = [{"id": 1, "name": "Test Entity"}] mocker.patch.object(Entity, "ENDPOINT", new="test_endpoint") diff --git a/tests/bfabric/entities/test_externaljob.py b/tests/bfabric/entities/test_externaljob.py new file mode 100644 index 00000000..9b24e2c0 --- /dev/null +++ b/tests/bfabric/entities/test_externaljob.py @@ -0,0 +1,33 @@ +from typing import Any + +import pytest +from pytest_mock import MockerFixture + +from bfabric.entities import ExternalJob, Workunit + + +@pytest.fixture() +def data_dict(): + return { + "id": 1, + "cliententityclassname": "Workunit", + "cliententityid": 5, + } + + +def test_workunit_when_available(mocker: MockerFixture, data_dict: dict[str, Any]): + mock_client = mocker.MagicMock(name="mock_client", spec=[]) + mock_find = mocker.patch.object(Workunit, "find") + external_job = ExternalJob(data_dict, mock_client) + assert external_job.workunit == mock_find.return_value + mock_find.assert_called_once_with(id=5, client=mock_client) + + +def test_workunit_when_wrong_class(mocker: MockerFixture, data_dict: dict[str, Any]): + mock_client = 
mocker.MagicMock(name="mock_client", spec=[]) + mock_find = mocker.patch.object(Workunit, "find") + # TODO actually check which ones are the legal values here + data_dict["cliententityclassname"] = "WrongClass" + external_job = ExternalJob(data_dict, mock_client) + assert external_job.workunit is None + mock_find.assert_not_called() diff --git a/tests/bfabric/entities/test_workunit.py b/tests/bfabric/entities/test_workunit.py index 0e09d46a..8798cd6f 100644 --- a/tests/bfabric/entities/test_workunit.py +++ b/tests/bfabric/entities/test_workunit.py @@ -98,7 +98,7 @@ def test_store_output_folder(mocker, mock_workunit) -> None: mock_application = mocker.MagicMock(storage={"projectfolderprefix": "xyz"}) mock_application.__getitem__.side_effect = {"technology": "tech", "name": "my app"}.__getitem__ mocker.patch.object(mock_workunit, "application", mock_application) - mocker.patch.object(mock_workunit, "container").id = 12 + mocker.patch.object(Workunit, "container", mocker.PropertyMock(return_value=mocker.MagicMock(id=12))) assert Path("xyz12/bfabric/tech/my_app/2024/2024-01/2024-01-02/workunit_30000") == mock_workunit.store_output_folder diff --git a/tests/bfabric/experimental/__init__.py b/tests/bfabric/experimental/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bfabric/experimental/test_entity_lookup_cache.py b/tests/bfabric/experimental/test_entity_lookup_cache.py new file mode 100644 index 00000000..13daee1e --- /dev/null +++ b/tests/bfabric/experimental/test_entity_lookup_cache.py @@ -0,0 +1,94 @@ +import pytest + +from bfabric.experimental.entity_lookup_cache import Cache, EntityLookupCache + + +@pytest.fixture() +def max_size() -> int: + return 3 + + +@pytest.fixture() +def cache(max_size: int): + cache = Cache(max_size=max_size) + cache.put("key1", "value1") + cache.put("key2", "value2") + return cache + + +@pytest.fixture() +def entity_cache(max_size: int): + result = EntityLookupCache(max_size=max_size) + result.put("Entity1", 1, "value1") + result.put("Entity1", 2, "value2") + return result + + +def test_cache_get_when_exists(cache): + assert cache.get("key1") == "value1" + assert cache.get("key2") == "value2" + + +def test_cache_get_when_not_exists(cache): + assert cache.get("missing") is None + + +@pytest.mark.parametrize("max_size", [0, 3]) +def test_cache_put(cache, max_size): + cache.put("key3", "value3") + cache.put("key4", "value4") + if max_size == 3: + assert cache.get("key1") is None + else: + assert cache.get("key1") == "value1" + assert cache.get("key2") == "value2" + assert cache.get("key3") == "value3" + assert cache.get("key4") == "value4" + + +def test_cache_contains(cache): + assert "key1" in cache + assert "key2" in cache + assert "key3" not in cache + + +def test_entity_lookup_cache_contains(entity_cache): + assert entity_cache.contains("Entity1", 1) + assert entity_cache.contains("Entity1", 2) + assert not entity_cache.contains("Entity1", 3) + assert not entity_cache.contains("Entity2", 1) + + +def test_entity_lookup_cache_get_when_exists(entity_cache): + assert entity_cache.get("Entity1", 1) == "value1" + assert entity_cache.get("Entity1", 2) == "value2" + + +def test_entity_lookup_cache_get_when_not_exists(entity_cache): + assert entity_cache.get("Entity1", 3) is None + + +def test_entity_lookup_cache_get_all(entity_cache): + result = entity_cache.get_all("Entity1", [1, 2, 3]) + assert result == {1: "value1", 2: "value2"} + + +def test_entity_lookup_cache_put(entity_cache): + entity_cache.put("Entity1", 3, "value3") + 
entity_cache.put("Entity1", 4, "value4") + assert entity_cache.get("Entity1", 1) is None + assert entity_cache.get("Entity1", 2) == "value2" + assert entity_cache.get("Entity1", 3) == "value3" + assert entity_cache.get("Entity1", 4) == "value4" + + +def test_entity_lookup_cache_enable(entity_cache): + assert entity_cache.instance() is None + with entity_cache.enable(): + first_instance = entity_cache.instance() + assert first_instance is not None + with entity_cache.enable(): + second_instance = entity_cache.instance() + assert first_instance is second_instance + assert entity_cache.instance() is first_instance + assert entity_cache.instance() is None diff --git a/tests/bfabric/utils/test_math_helper.py b/tests/bfabric/utils/test_math_helper.py deleted file mode 100644 index 0f81be22..00000000 --- a/tests/bfabric/utils/test_math_helper.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -import bfabric.utils.math_helper as math_helper - - -class BfabricTestMath(unittest.TestCase): - def test_integer_division(self): - # Main purpose of dictionary sorting is that they appear consistent when printed - self.assertEqual(math_helper.div_int_ceil(120, 100), 2) - self.assertEqual(math_helper.div_int_ceil(200, 100), 2) - self.assertEqual(math_helper.div_int_ceil(245, 100), 3) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/tests/bfabric/wrapper_creator/__init__.py b/tests/bfabric/wrapper_creator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bfabric/wrapper_creator/test_slurm.py b/tests/bfabric/wrapper_creator/test_slurm.py new file mode 100644 index 00000000..3f375609 --- /dev/null +++ b/tests/bfabric/wrapper_creator/test_slurm.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest +from logot import Logot, logged +from pytest_mock import MockerFixture + +from bfabric.wrapper_creator.slurm import SLURM + + +@pytest.fixture() +def mock_slurm() -> SLURM: + return SLURM(slurm_root=Path("/tmp/test_slurm")) + + +@pytest.mark.parametrize("path", ["/tmp/hello/world.txt", Path("/tmp/hello/world.txt")]) +def test_sbatch_when_success(mocker: MockerFixture, mock_slurm: SLURM, path: Path | str) -> None: + mock_is_file = mocker.patch.object(Path, "is_file", return_value=True) + mocker.patch("os.environ", new={"x": "y"}) + mock_run = mocker.patch("subprocess.run", return_value=mocker.MagicMock(stdout="stdout", stderr="stderr")) + stdout, stderr = mock_slurm.sbatch(script=path) + assert stdout == "stdout" + assert stderr == "stderr" + mock_run.assert_called_once_with( + [Path("/tmp/test_slurm/bin/sbatch"), Path(path)], + env={"SLURMROOT": Path("/tmp/test_slurm"), "x": "y"}, + check=True, + shell=False, + capture_output=True, + encoding="utf-8", + ) + assert mock_is_file.call_count == 2 + + +def test_sbatch_when_script_not_exists(mocker: MockerFixture, mock_slurm: SLURM, logot: Logot) -> None: + mocker.patch("bfabric.wrapper_creator.slurm.Path", side_effect=lambda x: x) + mock_script = mocker.MagicMock(name="script", is_file=lambda: False) + result = mock_slurm.sbatch(script=mock_script) + assert result is None + logot.assert_logged(logged.error(f"Script not found: {mock_script}")) + + +def test_sbatch_when_sbatch_not_exists(mocker: MockerFixture, mock_slurm: SLURM, logot: Logot) -> None: + mocker.patch("bfabric.wrapper_creator.slurm.Path", side_effect=lambda x: x) + mock_script = mocker.MagicMock(name="script", is_file=lambda: True) + mock_sbatch = mocker.patch.object(mock_slurm, "_sbatch_bin", 
mocker.MagicMock(is_file=lambda: False)) + result = mock_slurm.sbatch(script=mock_script) + assert result is None + logot.assert_logged(logged.error(f"sbatch binary not found: {mock_sbatch}")) diff --git a/tests/bfabric_scripts/test_bfabric_slurm_queue_status.py b/tests/bfabric_scripts/test_bfabric_slurm_queue_status.py index dac45445..faf5b56d 100644 --- a/tests/bfabric_scripts/test_bfabric_slurm_queue_status.py +++ b/tests/bfabric_scripts/test_bfabric_slurm_queue_status.py @@ -2,8 +2,8 @@ import polars.testing import pytest -from bfabric.entities import Workunit -from bfabric_scripts.bfabric_slurm_queue_status import get_slurm_jobs, get_workunit_status +from bfabric.entities import Workunit, Application +from bfabric_scripts.bfabric_slurm_queue_status import get_slurm_jobs, get_workunit_infos @pytest.fixture @@ -32,16 +32,21 @@ def test_get_slurm_jobs_when_local(mocker, command_output): pl.testing.assert_frame_equal(df, expected_df) -def test_get_workunit_status(mocker): +def test_get_workunit_infos(mocker): mock_client = mocker.Mock(name="mock_client") - mock_find_all = mocker.patch.object(Workunit, "find_all") + mock_workunit_find_all = mocker.patch.object(Workunit, "find_all") workunit_ids = [5000, 5001] - mock_find_all.return_value = { - 5001: Workunit({"id": 5000, "status": "RUNNING"}), + mock_workunit_find_all.return_value = { + 5001: Workunit({"id": 5000, "status": "RUNNING", "application": {"id": 1}}), } - status = get_workunit_status(mock_client, workunit_ids) - assert status == {5000: "ZOMBIE", 5001: "RUNNING"} - mock_find_all.assert_called_once_with(ids=workunit_ids, client=mock_client) + mock_app_find_all = mocker.patch.object(Application, "find_all") + mock_app_find_all.return_value = {1: {"id": 1, "name": "myapp"}} + infos = get_workunit_infos(mock_client, workunit_ids) + assert infos == [ + {"workunit_id": 5000, "status": "ZOMBIE", "application_name": "N/A"}, + {"workunit_id": 5001, "status": "RUNNING", "application_name": "myapp"}, + ] + mock_workunit_find_all.assert_called_once_with(ids=workunit_ids, client=mock_client) if __name__ == "__main__":