diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e1a56ea..619e2f2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,164 +1,38 @@ -name: tests +name: Run tests on: - push: - tags: - - 'v[0-9]+.[0-9]+.[0-9]+' - - 'v[0-9]+.[0-9]+.[0-9]+a[0-9]+' - - 'v[0-9]+.[0-9]+.[0-9]+b[0-9]+' - - 'v[0-9]+.[0-9]+.[0-9]+rc[0-9]+' pull_request: - branches: - - '*' - workflow_dispatch: - inputs: - target: - description: "How much of the test suite to run" - type: choice - default: default - options: - - default - - full - - downstream - cache: - description: "Use cache" - type: boolean - default: true - schedule: - - cron: '0 15 * * SUN' - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true + push: + branches: [main] + paths-ignore: + - "docs/**" + - "notebooks/**" + - "scripts/**" jobs: - pre_commit: - name: Run pre-commit - runs-on: 'ubuntu-latest' - steps: - - uses: holoviz-dev/holoviz_tasks/pre-commit@v0.1a19 - setup: - name: Setup workflow - runs-on: ubuntu-latest - outputs: - matrix: ${{ env.MATRIX }} - matrix_option: ${{ env.MATRIX_OPTION }} - steps: - - name: Set matrix option - run: | - if [[ '${{ github.event_name }}' == 'workflow_dispatch' ]]; then - OPTION=${{ github.event.inputs.target }} - elif [[ '${{ github.event_name }}' == 'schedule' ]]; then - OPTION="full" - elif [[ '${{ github.event_name }}' == 'push' && '${{ github.ref_type }}' == 'tag' ]]; then - OPTION="full" - else - OPTION="default" - fi - echo "MATRIX_OPTION=$OPTION" >> $GITHUB_ENV - - name: Set test matrix with 'default' option - if: env.MATRIX_OPTION == 'default' - run: | - MATRIX=$(jq -nsc '{ - "os": ["ubuntu-latest", "macos-latest", "windows-latest"], - "python-version": ["3.10", "3.12"], - "exclude": [ - { - "python-version": "3.10", - "os": "macos-latest" - } - ] - }') - echo "MATRIX=$MATRIX" >> $GITHUB_ENV - - name: Set test matrix with 'full' option - if: env.MATRIX_OPTION == 
'full' - run: | - MATRIX=$(jq -nsc '{ - "os": ["ubuntu-latest", "macos-latest", "windows-latest"], - "python-version": ["3.10", "3.12"], - "include": [ - { - "python-version": "3.10", - "os": "ubuntu-latest" - }, - { - "python-version": "3.11", - "os": "ubuntu-latest" - }, - , - { - "python-version": "3.12", - "os": "ubuntu-latest" - } - ] - }') - echo "MATRIX=$MATRIX" >> $GITHUB_ENV - - name: Set test matrix with 'downstream' option - if: env.MATRIX_OPTION == 'downstream' - run: | - MATRIX=$(jq -nsc '{ - "os": ["ubuntu-latest"], - "python-version": ["3.12"] - }') - echo "MATRIX=$MATRIX" >> $GITHUB_ENV - - conda_suite: - name: conda tests:${{ matrix.os }}:${{ matrix.python-version }} - needs: [pre_commit, setup] - if: needs.setup.outputs.matrix_option != 'default' - runs-on: ${{ matrix.os }} + test: strategy: - fail-fast: false - matrix: ${{ fromJson(needs.setup.outputs.matrix) }} - timeout-minutes: 90 - defaults: - run: - shell: bash -el {0} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: conda-incubator/setup-miniconda@v3 - with: - auto-update-conda: true - environment-file: envs/py${{ matrix.python-version }}-tests.yaml - activate-environment: holoseqtests - - name: conda info - run: conda info - - name: conda list - run: conda list - - name: unit tests - run: pytest -v holoseq --cov=holoseq --cov-append - pip_test: - name: pip tests:${{ matrix.os }}:${{ matrix.python-version }} - needs: [pre_commit, setup] - timeout-minutes: 90 + matrix: + os: [ubuntu-latest] + version: ["3.10", "3.11", "3.12"] + runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: ${{ fromJson(needs.setup.outputs.matrix) }} - defaults: - run: - shell: bash -e {0} steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: actions/setup-python@v5 + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.version }} + uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} - - name: install 
with geo - run: python -m pip install -v --prefer-binary -e '.[tests, examples-tests, geo, hvdev, hvdev-geo, dev-extras]' - - name: python version and pip list + python-version: ${{ matrix.version }} + + - name: Install dependencies run: | - python --version --version - python -m pip list - - name: unit tests - run: pytest -v hvplot --cov=hvplot --cov-append - - name: Upload coverage reports to Codecov - if: github.event_name == 'push' || github.event_name == 'pull_request' - uses: codecov/codecov-action@v4 - with: - fail_ci_if_error: false - verbose: false - env: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file + python -m pip install --upgrade pip + pip install .[dev,test] + + - name: Run ruff + uses: astral-sh/ruff-action@v2 + + - name: Run tests + run: | + pytest -v tests --cov=src/holoseq --cov-append diff --git a/.gitignore b/.gitignore index 82f9275..55f700b 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +environment.yaml # PyInstaller # Usually these files are written by a python script from a template diff --git a/README.md b/README.md index 66c52be..fd12c9d 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ down to individual points and back.* This is new work in progress. Development started in late October 2024. A draft framework -[description and specification is here.](https://github.com/holoviz-topics/holoSeq/blob/main/HoloSeqOverview.md). 
+[description and specification is here.](https://github.com/holoviz-topics/holoSeq/blob/main/docs/HoloSeqOverview.md) ## Core idea: Features on intervals arranged along linear axes for browsing @@ -35,7 +35,7 @@ and plotted using rasterize and datashader, with each tap converted into contig in an IPython notebook, or if the dependencies are available, can be served from this repository's root, as: -`panel serve holoSeq_random.py --show` +`panel serve scripts/holoSeq_random.py --show` Edit the default 10000 xmax value to get a sense of scale capacity - 10M is not a problem. There is very little code needed for plotting. Most of the code is needed to create some sample contigs of @@ -196,15 +196,15 @@ the plot. - Only pairs involving H1 contigs (H1 cis) are used in the demonstration. Briefly, the framework creates the -[minimum data required](https://github.com/holoviz-topics/holoSeq/blob/main/HoloSeqOverview.md) to create a -plot. A genome lengths file is required, and the named contigs can be reordered by name or length. -The axes are defined by the ordering. The lengths are cumulated to give an offset to the first -nucleotide of each contig, so the track can be read and feature locations converted into the plot -coordinate system, and stored as a compressed intermediate file. The display application reads these -pre-computed plot coordinate files, with enough metadata about the reference sequence to add tic -marks to the axes and to back-calculate the stream of user tap coordinates. A converter for PAF to -compressed hseq format for input is available and was used to generate the demonstration. Bigwig is -working and other common genomic annotation formats, such as gff and vcf will follow. +[minimum data required](https://github.com/holoviz-topics/holoSeq/blob/main/docs/HoloSeqOverview.md) +to create a plot. A genome lengths file is required, and the named contigs can be reordered by name +or length. The axes are defined by the ordering. 
The lengths are cumulated to give an offset to the +first nucleotide of each contig, so the track can be read and feature locations converted into the +plot coordinate system, and stored as a compressed intermediate file. The display application reads +these pre-computed plot coordinate files, with enough metadata about the reference sequence to add +tic marks to the axes and to back-calculate the stream of user tap coordinates. A converter for PAF +to compressed hseq format for input is available and was used to generate the demonstration. Bigwig +is working and other common genomic annotation formats, such as gff and vcf will follow. Multiple input files will produce a stack of plots that work independently: @@ -255,8 +255,8 @@ else ``` This repository includes a python script conversion utility for PAF inputs, -`scripts/holoSeq_prepare_paf.py`, that works with the awk PAF output and converts it into a compressed -coordinate file. The compressed demonstration plotting data were prepared using: +`scripts/holoSeq_prepare_paf.py`, that works with the awk PAF output and converts it into a +compressed coordinate file. The compressed demonstration plotting data were prepared using: ```bash python scripts/holoSeq_prepare_paf.py \ @@ -393,19 +393,31 @@ cd holoSeq ``` Create a virtual environment using your favorite method, _e.g._ `conda`, `venv`, `poetry`, `pixi` -_etc_. We will use `conda` as an example. +_etc_. We will use `conda` as an example. No `conda` environment file is supplied within the repo, +however, we can generate one using +[`pyproject2conda`](https://github.com/usnistgov/pyproject2conda). See `pyproject2conda` for how +to install it on your system. 
```bash +pyproject2conda yaml --file pyproject.toml \ + --no-header \ + --name holoseq-dev \ + --channel conda-forge \ + --python-include infer \ + --extra dev --extra notebooks --extra test \ + --output environment.yaml conda create env --file environment.yaml conda activate holoseq-dev ``` Next install `holoSeq` into the virtual environment, and install the pre-commit hooks. If you would -like to contribute to work with Jupyter notebooks, install `notebooks` along with the `dev` and -`tests` flags. +like to contribute to work with Jupyter notebooks, be sure to install `notebooks` in the `pip` +command. ```bash -pip install --editable .[dev,test] # Include notebooks if you would like to install Juptyer. +pip install --editable . +# For installing notebooks development as well, use the following command instead of the one above. +#pip install --editable .[notebooks] pre-commit install ``` diff --git a/docs/HoloSeqOverview.md b/docs/HoloSeqOverview.md index 69e9f63..00d4f92 100644 --- a/docs/HoloSeqOverview.md +++ b/docs/HoloSeqOverview.md @@ -1,72 +1,86 @@ # HoloSeq +Internally, this project uses a *precomputed mapping data format* for sequence annotation that +allows large scale data to be viewed -Internally, this project uses a *precomputed mapping data format* for sequence annotation that allows large scale data to be viewed +as 1D charts or 2D heatmaps using a generic visualisation infrastructure built using the +[Holoviews ecosystem](https://holoviews.org/). -as 1D charts or 2D heatmaps using a generic visualisation infrastructure built using the [Holoviews ecosystem](https://holoviews.org/). +Preparing the coordinates for a 60GB HiC PAF file with 720 million pairs takes a couple of hours so +it makes sense to save them in an intermediate precomputed format. Displaying the resulting 1.2GB +compressed coordinates for 200M pairs only involving H1 takes about 10 minutes in comparison. 
-Preparing the coordinates for a 60GB HiC PAF file with 720 million pairs takes a couple of hours so it makes sense to save them in -an intermediate precomputed format. Displaying the resulting 1.2GB compressed coordinates for 200M pairs only involving H1 takes about 10 minutes in comparison. +The presentation layer supports genome scale feature data associated with a genomic reference or +other sequence. The user can pan and zoom smoothly from whole genomes down to individual points, +with tens of millions of rows of data, in a web browser running on a suitable laptop or in Galaxy. +The data format provides all the information needed for recreating a plot using the inbuilt +reference sequence coordinates as axes. -The presentation layer supports genome scale feature data associated with a genomic reference or other sequence. The user can pan and zoom -smoothly from whole genomes down to individual points, with tens of millions of rows of data, in a web browser running on a suitable laptop or in Galaxy. -The data format provides all the information needed for recreating a plot using the inbuilt reference sequence coordinates as axes. +The design isolates the complexities of displaying many different kinds of annotation at genomic +scale, from the messy challenges of converting complex existing data in standard formats. The +intention is to allow any number of precomputed track coordinate files to be supplied to the generic +display application, where they are automatically organised and displayed, using hints on layout +supplied on the command line. -The design isolates the complexities of displaying many different kinds of annotation at genomic scale, from the messy challenges of converting -complex existing data in standard formats. The intention is to allow any number of precomputed track coordinate files to be supplied to the generic display -application, where they are automatically organised and displayed, using hints on layout supplied on the command line. 
+The main use case envisioned is a central repository of pre-computed plots to make annotation of the +VGP genomic data easily accessible. Precomputed plot tracks can be reused indefinitely, mixed and +matched by the user to suit their needs. -The main use case envisioned is a central repository of pre-computed plots to make annotation of the VGP genomic data easily accessible. -Precomputed plot tracks can be reused indefinitely, mixed and matched by the user to suit their needs. - -Each species has different coordinate systems so cannot share a reference sequence axis, but can each be displayed in -tracks with separate reference sequences side by side or stacked. +Each species has different coordinate systems so cannot share a reference sequence axis, but can +each be displayed in tracks with separate reference sequences side by side or stacked. ### Potential sources of annotation for display -HiC data in PAF format was used for the proof of concept 2D heatmaps. Mashmap approximate mapping PAF works well. +HiC data in PAF format was used for the proof of concept 2D heatmaps. Mashmap approximate mapping +PAF works well. Bigwig, bed and GFF are the major formats for 1D annotation tracks. ### Coordinate system -Genomic reference data forms the backbone for any annotation browser. -New genomes are assembled into multiple "contigs", that are refined into chromosomes in reference genomes. -Genomes are typically handled in fasta format. A newly assembled haplotype may have thousands of contigs -that have not yet been merged into chromosomes. Contig names must be unique to each genome or haplotype. +Genomic reference data forms the backbone for any annotation browser. New genomes are assembled into +multiple "contigs", that are refined into chromosomes in reference genomes. Genomes are typically +handled in fasta format. A newly assembled haplotype may have thousands of contigs that have not yet +been merged into chromosomes. 
Contig names must be unique to each genome or haplotype. -For an interactive genome browser, tracks typically run horizontally, from the start of the reference sequence on the left to the last nucleotide of the last contig. +For an interactive genome browser, tracks typically run horizontally, from the start of the +reference sequence on the left to the last nucleotide of the last contig. -Holoviews dynamic maps require ordinal axis coordinates. Contigs must be ordered on the axes, usually by name or by length, -so they can be mapped as axes, with tick marks and labels. +Holoviews dynamic maps require ordinal axis coordinates. Contigs must be ordered on the axes, +usually by name or by length, so they can be mapped as axes, with tick marks and labels. -Features must have a position to locate them in the reference sequence used to create the axis. -Position is unambiguously described by the name of the contig, and the number of bases from the start of the contig to the start of the feature. -Some features have a length while others for 2D grids are points. Many features may have optional annotation. +Features must have a position to locate them in the reference sequence used to create the axis. +Position is unambiguously described by the name of the contig, and the number of bases from the +start of the contig to the start of the feature. Some features have a length while others for 2D +grids are points. Many features may have optional annotation. -To convert feature positions into plot axis coordinates, contig lengths are cumulated in the order given, and a zero is inserted, -to give the ordinal position on the axis, for the first base of each ordered contig. +To convert feature positions into plot axis coordinates, contig lengths are cumulated in the order +given, and a zero is inserted, to give the ordinal position on the axis, for the first base of each +ordered contig. 
-When a feature is mapped, the appropriate contig's cumulated start is added to the feature offset, to give the -ordinal start coordinate on that axis for the start of the feature. +When a feature is mapped, the appropriate contig's cumulated start is added to the feature offset, +to give the ordinal start coordinate on that axis for the start of the feature. Additional annotation values may be optionally displayed as hover tooltips. ## Input format for 1D and 2D features on pre-mapped axis coordinates -The converters produce gzip compressed text files. +The converters produce gzip compressed text files. The text file must start with a header section, where every row begins with `@`. -The first row of the must be either `@v1HoloSeq1D [bar|scatter|line]` or `@v1HoloSeq2D`, or the data will not be processed. +The first row of the must be either `@v1HoloSeq1D [bar|scatter|line]` or `@v1HoloSeq2D`, or the data +will not be processed. -For 1D data, the chart type may be one of `bar`, `scatter` or `line`. Default is `bar`. Regions with 4 or more SD above or below the global mean are -emphasised +For 1D data, the chart type may be one of `bar`, `scatter` or `line`. Default is `bar`. Regions with +4 or more SD above or below the global mean are emphasised -2D data will be presented as an autoscaling density heatmap. The header and data might only have 1 axis name, for example where HiC pairs from one haplotype are plotted -with that sequence on both axes, or 2 axis names, if HiC pairs involving both haplotypes, one on each axis, are being plotted. +2D data will be presented as an autoscaling density heatmap. The header and data might only have 1 +axis name, for example where HiC pairs from one haplotype are plotted with that sequence on both +axes, or 2 axis names, if HiC pairs involving both haplotypes, one on each axis, are being plotted. 
-The subsequent header rows must have the plot title, plot type, axis names, contig names and their cumulated lengths, delimited by whitespace, and starting with `@` such as +The subsequent header rows must have the plot title, plot type, axis names, contig names and their +cumulated lengths, delimited by whitespace, and starting with `@` such as ``` @v1HoloSeq1D bar @@ -82,38 +96,39 @@ In this case, H1 will show four chromosomes starting at each of the positions sh Metadata such as the name of the chromosome lengths file and the plot title, is prefixed with `@@` +Data rows for 1D data must have the x ordinal axis coordinate, a feature length, and an annotation +value to show for that length, such as: `2455453443 128 99.8` -Data rows for 1D data must have the x ordinal axis coordinate, a feature length, and an annotation value to show for that length, such as: -`2455453443 128 99.8` - -Data rows for 2D data must have the x and y ordinal grid coordinates that correspond to the contigs in the header, and -may have a heatmap value or other annotation such as: +Data rows for 2D data must have the x and y ordinal grid coordinates that correspond to the contigs +in the header, and may have a heatmap value or other annotation such as: `3999543 58898548 2` -If there are multiple rows at the same coordinates without annotation, the count is used to provide heatmap values. +If there are multiple rows at the same coordinates without annotation, the count is used to provide +heatmap values. -The header allows the cumulated start axis coordinate of each ordered contig to be recovered, and used to create the axis tick marks -corresponding to the start of each contig. +The header allows the cumulated start axis coordinate of each ordered contig to be recovered, and +used to create the axis tick marks corresponding to the start of each contig. ## Visualisation -Input files with identical contig names are grouped into a vertical stack with linked axes. 
-For each of these stacks, the header contig/length values are used to prepare the axis tick marks. +Input files with identical contig names are grouped into a vertical stack with linked axes. For each +of these stacks, the header contig/length values are used to prepare the axis tick marks. -For each 1D input, the chart type is prepared as a panel row. -2D inputs are turned into datamaps with optional scale bars and tap for location coordinates. +For each 1D input, the chart type is prepared as a panel row. 2D inputs are turned into datamaps +with optional scale bars and tap for location coordinates. Groups of tracks are stacked into a panel to be served. -All plots have optional scale bars for reference and a mouse click anywhere on a plot will show the coordinates above the plot. -Tap coordinates are calculated from a stream giving the tap x and y coordinates, using a binary search on the contig coordinate starts. +All plots have optional scale bars for reference and a mouse click anywhere on a plot will show the +coordinates above the plot. Tap coordinates are calculated from a stream giving the tap x and y +coordinates, using a binary search on the contig coordinate starts. ## Deployment -A visualisation can be run locally and viewed in a desktop browser by stacking any number of compressed coordinate hseq -files +A visualisation can be run locally and viewed in a desktop browser by stacking any number of +compressed coordinate hseq files -`panel serve holoseq_display.py --show --args --inFile foo.gz bar.gz baz.gz zot.gz --title my holoseq plot` +`panel serve scripts/holoseq_display.py --show --args --inFile foo.gz bar.gz baz.gz zot.gz --title "my holoseq plot"` The Dockerfile is a work in progress as part of an interactive Galaxy tool. 
diff --git a/environment.yaml b/environment.yaml deleted file mode 100644 index d6c3ff6..0000000 --- a/environment.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: holoseq-dev -channels: - - conda-forge - - pyviz/label/dev -dependencies: - - python >3.10,<3.13 - - bokeh - - dask[dataframe] - - datashader - - holoviews - - pre-commit - - ruff - - jupyterlab - - pytest - - pytest-cov - - nbsite>=0.8.4,<0.9.0 - - setuptools - - setuptools-scm \ No newline at end of file diff --git a/envs/py3.10-tests.yaml b/envs/py3.10-tests.yaml deleted file mode 100644 index 4a8ff21..0000000 --- a/envs/py3.10-tests.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# -# This file is autogenerated by pyproject2conda -# with the following command: -# -# pyproject2conda project --overwrite force --template-python envs/py{py_version}-{env} -# -# You should not manually edit this file. -# Instead edit the corresponding pyproject.toml file. -# -name: holoseqtests -channels: - - nodefaults - - pyviz/label/dev - - conda-forge -dependencies: - - python=3.10 - - bokeh - - dask - - datashader - - holoviews - - jupyterlab - - pre-commit - - pytest - - pytest-cov - - ruff diff --git a/envs/py3.11-tests.yaml b/envs/py3.11-tests.yaml deleted file mode 100644 index 3372863..0000000 --- a/envs/py3.11-tests.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# -# This file is autogenerated by pyproject2conda -# with the following command: -# -# pyproject2conda project --overwrite force --template-python envs/py{py_version}-{env} -# -# You should not manually edit this file. -# Instead edit the corresponding pyproject.toml file. 
-# -name: holoseqtests -channels: - - nodefaults - - pyviz/label/dev - - conda-forge -dependencies: - - python=3.11 - - bokeh - - dask - - datashader - - holoviews - - jupyterlab - - pre-commit - - pytest - - pytest-cov - - ruff diff --git a/envs/py3.12-tests.yaml b/envs/py3.12-tests.yaml deleted file mode 100644 index e47a653..0000000 --- a/envs/py3.12-tests.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# -# This file is autogenerated by pyproject2conda -# with the following command: -# -# pyproject2conda project --overwrite force --template-python envs/py{py_version}-{env} -# -# You should not manually edit this file. -# Instead edit the corresponding pyproject.toml file. -# -name: holoseqtests -channels: - - nodefaults - - pyviz/label/dev - - conda-forge -dependencies: - - python=3.12 - - bokeh - - dask - - datashader - - holoviews - - jupyterlab - - pre-commit - - pytest - - pytest-cov - - ruff diff --git a/pyproject.toml b/pyproject.toml index 16a6e95..74cdf5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,12 +51,4 @@ version = {attr = "holoseq.__version__"} line-length = 100 [tool.ruff.lint.flake8-tidy-imports] -ban-relative-imports = "all" # Not not allow relative imports. - -[tool.pyproject2conda.dependencies] - -[tool.pyproject2conda.envs."tests"] -channels = ["nodefaults", "pyviz/label/dev", "conda-forge"] -python = ["3.10", "3.11", "3.12"] -extras = ["test", "notebooks", "dev"] -name = "holoseqtests" \ No newline at end of file +ban-relative-imports = "all" # Do not allow relative imports. 
diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 59f392e..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -datashader -dask[dataframe] -holoviews[recommended] -pandas -bokeh \ No newline at end of file diff --git a/scripts/hapsHiCpaf.py b/scripts/hapsHiCpaf.py index 2bff3dd..62b3b91 100644 --- a/scripts/hapsHiCpaf.py +++ b/scripts/hapsHiCpaf.py @@ -16,26 +16,21 @@ # This holoviews application is mostly monolithic because it cannot easily be # split up without passing lots of parameters AFAIK. # it works. be happy. +import math from bisect import bisect_left from collections import OrderedDict from functools import cmp_to_key -import math -import numpy as np import holoviews as hv -import panel as pn +import numpy as np import pandas as pd - -from holoviews.operation.datashader import ( - rasterize, - dynspread, -) -from holoviews.operation.resample import ResampleOperation2D +import panel as pn from holoviews.operation import decimate +from holoviews.operation.datashader import dynspread, rasterize +from holoviews.operation.resample import ResampleOperation2D useDecimate = False # will rasterize instead -# inFile = "galaxy_inputs/paf/bothmap.paf.tab.tabular" -inFile = "/home/ross/rossgit/holoviews-examples/huge.paf" +inFile = "" ptwidth = 1000 pcwidth = 800 # width settings for plots and location bars diff --git a/scripts/holoSeq_prepare_paf.py b/scripts/holoSeq_prepare_paf.py index 0865828..4301e84 100644 --- a/scripts/holoSeq_prepare_paf.py +++ b/scripts/holoSeq_prepare_paf.py @@ -27,30 +27,24 @@ # Ross Lazarus October 2024 import argparse - -from collections import OrderedDict -from functools import cmp_to_key -from pathlib import Path - import gzip import io import itertools import logging import math -import numpy as np - -import re import os +import re +from collections import OrderedDict +from functools import cmp_to_key +from pathlib import Path +import numpy as np import pandas as pd import pybigtools 
logging.basicConfig(level=logging.DEBUG) log = logging.getLogger("holoseq_prepare") - -# inFile = "galaxy_inputs/paf/bothmap.paf.tab.tabular" -inFile = "/home/ross/rossgit/holoviews-examples/huge.paf" - +inFile = "" holoSeqHeaders = ["@v1HoloSeq1D", "@v1HoloSeq2D"] @@ -257,11 +251,7 @@ def __init__(self, gff, outFname, contigs, args): segs[id] = [] if kind.lower() in ["cds", "mrna"]: anno = text.split(";") - tanno = [ - x.strip()[7:] - for x in anno - if x.lower().startswith("target=") - ] + tanno = [x.strip()[7:] for x in anno if x.lower().startswith("target=")] target = tanno[0] startp = int(startp) endp = int(endp) @@ -278,10 +268,7 @@ def __init__(self, gff, outFname, contigs, args): if kind.lower() == "mrna": if target: if mrnaseen.get(target, None): - log.debug( - "Seeing mrna target %s again at row %d" - % (target, i) - ) + log.debug("Seeing mrna target %s again at row %d" % (target, i)) else: mrnaseen[target] = target segs[id].append( @@ -413,9 +400,7 @@ def __init__(self, inFname, outFname, args, contigs): bw = bwf.records(bchr) data[cchr]["xval"] = [x[2] for x in bw] else: - log.warn( - "Bigwig contig %s not found in supplied X axis lengths file" % cchr - ) + log.warn("Bigwig contig %s not found in supplied X axis lengths file" % cchr) self.export_mapping(outFname, contigs, data, args) def export_mapping(self, outFname, contigs, data, args): @@ -447,9 +432,7 @@ def prepHeader(contigs, args): ofn.write(str.encode("\n".join(hdr) + "\n")) for chr in data.keys(): for i in range(len(data[chr]["xstart"])): - row = str.encode( - "%d %d\n" % (data[chr]["xstart"][i], data[chr]["xval"][i]) - ) + row = str.encode("%d %d\n" % (data[chr]["xstart"][i], data[chr]["xval"][i])) ofn.write(row) @@ -565,9 +548,7 @@ def prepHeader( """ h = ["@%s %s %d" % (getHap(k), k, xcontigs[k]) for k in xcontigs.keys()] if len(haps) > 1: - h += [ - "@%s %s %d" % (getHap(k), k, ycontigs[k]) for k in ycontigs.keys() - ] + h += ["@%s %s %d" % (getHap(k), k, ycontigs[k]) for k in 
ycontigs.keys()] metah = [ hsId, "@@heatmap", @@ -651,12 +632,8 @@ def prepHeader( help="Optional Y axis contig names and lengths, whitespace delimited for different reference sequences", required=False, ) - parser.add_argument( - "--title", help="Title for the plot", default="Plot title goes here" - ) - parser.add_argument( - "--contig_sort", help="VGPname, name, length, none", default="length" - ) + parser.add_argument("--title", help="Title for the plot", default="Plot title goes here") + parser.add_argument("--contig_sort", help="VGPname, name, length, none", default="length") parser.add_argument( "--refURI", help="URI for the genome reference sequence used for the coordinates for metadata", diff --git a/scripts/holoSeq_random.py b/scripts/holoSeq_random.py index ef328c6..65d1001 100644 --- a/scripts/holoSeq_random.py +++ b/scripts/holoSeq_random.py @@ -1,4 +1,4 @@ -# see https://github.com/fubar2/holoSeq +# see https://github.com/holoviz-topics/holoSeq # illustrates some of the basic ideas in converting # a set of features that have been mapped to a genome into a # linear or in this case 2D display. 
diff --git a/scripts/holoseq_display.py b/scripts/holoseq_display.py index c0184bf..1690774 100644 --- a/scripts/holoseq_display.py +++ b/scripts/holoseq_display.py @@ -1,6 +1,6 @@ # ruff: noqa -# see https://github.com/fubar2/holoSeq +# see https://github.com/holoviz-topics/holoSeq # pip install datashader dask[dataframe] holoviews[recommended] pandas matplotlib bokeh # # panel serve --address 0.0.0.0 --port 8080 --show --session-token-expiration 9999999 --args --inFile ../hg002_bothHiC.paf_cisH1_hseq.gz @@ -15,31 +15,23 @@ # Ross Lazarus October 2024 import argparse -from bisect import bisect_left -from collections import OrderedDict import gzip import logging -import numpy as np import os +from bisect import bisect_left +from collections import OrderedDict import holoviews as hv +import numpy as np import pandas as pd import panel as pn - - -from holoviews.operation.datashader import ( - rasterize, - dynspread, -) +from holoviews.operation import decimate +from holoviews.operation.datashader import dynspread, rasterize from holoviews.operation.element import apply_when from holoviews.operation.resample import ResampleOperation2D -from holoviews.operation import decimate - logging.basicConfig(level=logging.DEBUG) log = logging.getLogger("holoseq_display") - - hv.extension("bokeh", "matplotlib", width=100) # Default values suitable for this notebook @@ -50,8 +42,7 @@ ResampleOperation2D.height = 250 -# inFile = "galaxy_inputs/paf/bothmap.paf.tab.tabular" -inFile = "/home/ross/rossgit/holoviews-examples/holoSeqtest.gz" +inFile = "" holoSeqHeaders = ["@v1HoloSeq1D", "@v1HoloSeq2D"] hv.extension("bokeh") pn.extension() @@ -81,7 +72,7 @@ def xportHtml(self, fname, hObj): def import_holoSeq_data(self, inFile): """ reverse process of dumping the data in holoSeq format from a converter - see https://github.com/fubar2/holoSeq/blob/main/HoloSeqOverview.md + see https://github.com/holoviz-topics/holoSeq/blob/main/docs/HoloSeqOverview.md """ haps = {} hh = [] @@ -100,9 +91,7 
@@ def import_holoSeq_data(self, inFile): log.warn( f"Supplied input {inFile} has first row {trow} so is not a valid holoSeq input file" ) - log.warn( - "First row must start with one of these:%s" % holoSeqHeaders - ) + log.warn("First row must start with one of these:%s" % holoSeqHeaders) return hsDims = holoSeqHeaders.index(hseqformat) + 1 if hsDims == 1: @@ -617,12 +606,10 @@ def showX(x, y): parser.add_argument( "--inFile", help="gzipped hseq coordinates and contigs", - default="mUroPar1_cis1.hseq.gz", + default="data/mUroPar1_cis1.hseq.gz", nargs="+", ) -parser.add_argument( - "--size", help="Display size in pixels. Default is 800", default=1000 -) +parser.add_argument("--size", help="Display size in pixels. Default is 800", default=1000) parser.add_argument("--version", "-V", action="version", version="0.1") args = parser.parse_args() pwidth = int(args.size) diff --git a/src/holoseq/exceptions.py b/src/holoseq/exceptions.py index 4748254..c9f409a 100644 --- a/src/holoseq/exceptions.py +++ b/src/holoseq/exceptions.py @@ -1,4 +1,4 @@ - class HoloSeqFormatError(Exception): - """Raised when the HoloSeq data format is invalid or corrupted.""" - pass \ No newline at end of file + """Raised when the HoloSeq data format is invalid, or corrupted.""" + + pass diff --git a/tests/test_basic.py b/tests/test_basic.py deleted file mode 100644 index 8a8a5ec..0000000 --- a/tests/test_basic.py +++ /dev/null @@ -1,4 +0,0 @@ -def test_data_loader(): - from holoseq.data import load - path = "data/mUroPar1H1H2.paf_cisH1_hseq.gz" - assert load(path)[0] is not None \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..f564490 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,11 @@ +from pathlib import Path + +from holoseq.data import load + +TEST_DIR = Path(__file__).parent.resolve() +DATA_DIR = TEST_DIR.joinpath("../data") + + +def test_load(): + path = DATA_DIR / "mUroPar1H1H2.paf_cisH1_hseq.gz" + assert load(path)[0] 
is not None