diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml index 740fb949..8566e305 100644 --- a/.github/workflows/documentation.yaml +++ b/.github/workflows/documentation.yaml @@ -1,35 +1,26 @@ -name: Sphinx Documentation +name: mkdocs-generation on: push: - branches: [ main ] - + branches: + - master + - main +permissions: + contents: write jobs: - build-docs: + deploy: runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - ref: ${{ github.ref }} - - - name: Set up Python 3. - uses: actions/setup-python@v3 + python-version: 3.x + - uses: actions/cache@v2 with: - python-version: 3.9 - - - name: install - run: pip3 install sphinx sphinx-rtd-theme click ddt pandas setuptools - - - name: Build documentation. - run: | - cd docs/ - sphinx-build -b html . _build - touch _build/.nojekyll - - - name: Deploy documentation. - if: ${{ github.event_name == 'push' }} - uses: JamesIves/github-pages-deploy-action@v4.3.0 - with: - branch: gh-pages - clean: true - folder: docs/_build + key: ${{ github.ref }} + path: .cache + - run: pip install mkdocs-material + - run: pip install mkdocs-material[imaging] + - run: pip install pillow cairosvg + - run: pip install mkdocs-material-extensions + - run: pip install mkdocstrings[python] + - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/README.md b/README.md index 8c3f5684..a1dc5e43 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,8 @@ Releases page. ## Background [Gene Ontology](http://geneontology.org/) traditionally has provided -annotations for genes rather than for specific isoforms. However, in -some cases, the functions of the individual isoforms of a gene are +annotations for genes rather than for specific isoforms. However, in +some cases, the functions of the individual isoforms of a gene are differ with respect to one or more of the gene's function. 
In this project, we have developed an algorithm for prediction of isoform-specific function across the entire transcriptome. The isopret app offers a number of @@ -33,13 +33,13 @@ Macintosh laptops with the new ARM M1 chip when using Oracle's SDK (version 17.0 have used [Azul Zulu](https://www.azul.com/downloads/?package=jdk) JDKs on M1 Macintoshes and could run Isopret and other JavaFX apps without problems. -We offer pre-built installation files for MacIntosh (M1 and Intel) in the Release section. +We offer pre-built installation files for Macintosh (M1 and Intel) in the Release section. ## Note to Linux users The easiest way to run isopret-gui on a linux system is to run the downloadable JAR file from the releases page. -```aidl +```bash java -jar isopret-gui.jar ``` @@ -53,4 +53,22 @@ A manuscript is in preparation. +## Set up documentation + +Enter the following code to install mkdocs and run a server locally. The GitHub action will create a comparable site online. + +```bash +python3 -m venv venv +source venv/bin/activate +pip install mkdocs-material +pip install "mkdocs-material[imaging]" +pip install mkdocs-material-extensions +pip install pillow cairosvg +pip install "mkdocstrings[python]" +mkdocs serve +``` + + + + diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 0bf45d59..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python -msphinx -SPHINXPROJ = isopret -SOURCEDIR = . -BUILDDIR = _build -html_static_path = ['..'] - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/isopret.css b/docs/_static/isopret.css deleted file mode 100644 index faea178e..00000000 --- a/docs/_static/isopret.css +++ /dev/null @@ -1,3 +0,0 @@ -.wy-nav-content { - max-width: 75% !important; -} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 11f28dcd..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,176 +0,0 @@ -# -*- coding: utf-8 -*- -# -# VPV documentation build configuration file, created by -# sphinx-quickstart on Sun Sep 24 12:02:05 2017. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. 
-project = u'isopret' -copyright = u'2022' -author = u'Peter Robinson' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = u'0.9' -# The full version, including alpha/beta/rc tags. -release = u'0.9' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] -#html_style = 'css/isopret.css' -html_css_files = ['isopret.css'] - -#def setup(app): -# app.add_css_file('default.css') - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', - 'donate.html', - ] -} - - -# -- Options for HTMLHelp output ------------------------------------------ - -# Output file base name for HTML help builder. -htmlhelp_basename = 'isopret' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'isopret.tex', u'isopret Documentation', - u'Peter Robinson', 'manual'), -] - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'isopret', u'isopret Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. 
List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'isopret', u'isopret Documentation', - author, 'isopret', 'Isoform interpretation.', - 'Miscellaneous'), -] - - - diff --git a/docs/examples.rst b/docs/examples.md similarity index 57% rename from docs/examples.rst rename to docs/examples.md index 569bfbf5..b662cbbb 100644 --- a/docs/examples.rst +++ b/docs/examples.md @@ -1,51 +1,48 @@ -.. _rstexamples: -=============================== -Example input files for isopret -=============================== +# Example input files for isopret -RNA-Seq datasets were obtained from NCBI's `Sequence Read Archive (SRA) `_. + +RNA-Seq datasets were obtained from NCBI's [Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra). These experiments study a broad range of conditons, but they all contained two distinct sets of samples that could be compared in a case-control design. A zip archive containing HBADEALS output files for each of the datasets can be obtained -from the zenodo archive `6483996 `_. - -.. _33524399: https://pubmed.ncbi.nlm.nih.gov/33524399/ -.. _31468702: https://pubmed.ncbi.nlm.nih.gov/31468702/ -.. _30641038: https://pubmed.ncbi.nlm.nih.gov/30641038/ -.. _33876188: https://pubmed.ncbi.nlm.nih.gov/33876188/ -.. _34413880: https://pubmed.ncbi.nlm.nih.gov/34413880/ -.. _34295332: https://pubmed.ncbi.nlm.nih.gov/34295332/ -.. _32152472: https://pubmed.ncbi.nlm.nih.gov/32152472/ -.. _30554964: https://pubmed.ncbi.nlm.nih.gov/30554964/ -.. _34669477: https://pubmed.ncbi.nlm.nih.gov/34669477/ -.. _33749664: https://pubmed.ncbi.nlm.nih.gov/33749664/ -.. _32243809: https://pubmed.ncbi.nlm.nih.gov/32243809/ -.. _30340504: https://pubmed.ncbi.nlm.nih.gov/30340504/ -.. _31829804: https://pubmed.ncbi.nlm.nih.gov/31829804/ -.. _32817457: https://pubmed.ncbi.nlm.nih.gov/32817457/ -.. _33188283: https://pubmed.ncbi.nlm.nih.gov/33188283/ -.. 
_30503522: https://pubmed.ncbi.nlm.nih.gov/30503522/ -.. _32487996: https://pubmed.ncbi.nlm.nih.gov/32487996/ -.. _31474574: https://pubmed.ncbi.nlm.nih.gov/31474574/ -.. _32735620: https://pubmed.ncbi.nlm.nih.gov/32735620/ -.. _32615169: https://pubmed.ncbi.nlm.nih.gov/32615169/ -.. _32441123: https://pubmed.ncbi.nlm.nih.gov/32441123/ -.. _31221130: https://pubmed.ncbi.nlm.nih.gov/31221130/ -.. _30135250: https://pubmed.ncbi.nlm.nih.gov/30135250/ -.. _33327559: https://pubmed.ncbi.nlm.nih.gov/33327559/ -.. _32294156: https://pubmed.ncbi.nlm.nih.gov/32294156/ -.. _34793334: https://pubmed.ncbi.nlm.nih.gov/34793334/ -.. _26679344: https://pubmed.ncbi.nlm.nih.gov/26679344/ -.. _34450236: https://pubmed.ncbi.nlm.nih.gov/34450236/ -.. _34215830: https://pubmed.ncbi.nlm.nih.gov/34215830/ -.. _32354235: https://pubmed.ncbi.nlm.nih.gov/32354235/ -.. _35310884: https://pubmed.ncbi.nlm.nih.gov/35310884/ -.. _24747576: https://pubmed.ncbi.nlm.nih.gov/24747576/ - - -Pathophysiology -############### +from the zenodo archive ([6483996](https://zenodo.org/record/6483996)). 
+ +- [PMID:33524399](https://pubmed.ncbi.nlm.nih.gov/33524399/) +- [PMID:31468702](https://pubmed.ncbi.nlm.nih.gov/31468702/) +- [PMID:30641038](https://pubmed.ncbi.nlm.nih.gov/30641038/) +- [PMID:33876188](https://pubmed.ncbi.nlm.nih.gov/33876188/) +- [PMID:34413880](https://pubmed.ncbi.nlm.nih.gov/34413880/) +- [PMID:34295332](https://pubmed.ncbi.nlm.nih.gov/34295332/) +- [PMID:32152472](https://pubmed.ncbi.nlm.nih.gov/32152472/) +- [PMID:30554964](https://pubmed.ncbi.nlm.nih.gov/30554964/) +- [PMID:34669477](https://pubmed.ncbi.nlm.nih.gov/34669477/) +- [PMID:33749664](https://pubmed.ncbi.nlm.nih.gov/33749664/) +- [PMID:32243809](https://pubmed.ncbi.nlm.nih.gov/32243809/) +- [PMID:30340504](https://pubmed.ncbi.nlm.nih.gov/30340504/) +- [PMID:31829804](https://pubmed.ncbi.nlm.nih.gov/31829804/) +- [PMID:32817457](https://pubmed.ncbi.nlm.nih.gov/32817457/) +- [PMID:33188283](https://pubmed.ncbi.nlm.nih.gov/33188283/) +- [PMID:30503522](https://pubmed.ncbi.nlm.nih.gov/30503522/) +- [PMID:32487996](https://pubmed.ncbi.nlm.nih.gov/32487996/) +- [PMID:31474574](https://pubmed.ncbi.nlm.nih.gov/31474574/) +- [PMID:32735620](https://pubmed.ncbi.nlm.nih.gov/32735620/) +- [PMID:32615169](https://pubmed.ncbi.nlm.nih.gov/32615169/) +- [PMID:32441123](https://pubmed.ncbi.nlm.nih.gov/32441123/) +- [PMID:31221130](https://pubmed.ncbi.nlm.nih.gov/31221130/) +- [PMID:30135250](https://pubmed.ncbi.nlm.nih.gov/30135250/) +- [PMID:33327559](https://pubmed.ncbi.nlm.nih.gov/33327559/) +- [PMID:32294156](https://pubmed.ncbi.nlm.nih.gov/32294156/) +- [PMID:34793334](https://pubmed.ncbi.nlm.nih.gov/34793334/) +- [PMID:26679344](https://pubmed.ncbi.nlm.nih.gov/26679344/) +- [PMID:34450236](https://pubmed.ncbi.nlm.nih.gov/34450236/) +- [PMID:34215830](https://pubmed.ncbi.nlm.nih.gov/34215830/) +- [PMID:32354235](https://pubmed.ncbi.nlm.nih.gov/32354235/) +- [PMID:35310884](https://pubmed.ncbi.nlm.nih.gov/35310884/) +- [PMID:24747576](https://pubmed.ncbi.nlm.nih.gov/24747576/) + +## 
Pathophysiology + The RNA-seq datasets in this group involve investigations of diseases and disease mechanisms. @@ -93,34 +90,34 @@ The RNA-seq datasets in this group involve investigations of diseases and diseas -.. _32652799: https://pubmed.ncbi.nlm.nih.gov/32652799/ -.. _35115664: https://pubmed.ncbi.nlm.nih.gov/35115664/ -.. _29025019: https://pubmed.ncbi.nlm.nih.gov/29025019/ -.. _32891909: https://pubmed.ncbi.nlm.nih.gov/32891909/ -.. _31827286: https://pubmed.ncbi.nlm.nih.gov/31827286/ -.. _30333485: https://pubmed.ncbi.nlm.nih.gov/30333485/ -.. _33986176: https://pubmed.ncbi.nlm.nih.gov/33986176/ -.. _34755188: https://pubmed.ncbi.nlm.nih.gov/34755188/ -.. _34739170: https://pubmed.ncbi.nlm.nih.gov/34739170/ -.. _28101782: https://pubmed.ncbi.nlm.nih.gov/28101782/ -.. _34758327: https://pubmed.ncbi.nlm.nih.gov/34758327/ -.. _30993572: https://pubmed.ncbi.nlm.nih.gov/30993572/ -.. _32710624: https://pubmed.ncbi.nlm.nih.gov/32710624/ -.. _31339449: https://pubmed.ncbi.nlm.nih.gov/31339449/ -.. _32365352: https://pubmed.ncbi.nlm.nih.gov/32365352/ -.. _34587152: https://pubmed.ncbi.nlm.nih.gov/34587152/ -.. _31941840: https://pubmed.ncbi.nlm.nih.gov/31941840/ -.. _34784250: https://pubmed.ncbi.nlm.nih.gov/34784250/ -.. _32645954: https://pubmed.ncbi.nlm.nih.gov/32645954/ -.. _34215830: https://pubmed.ncbi.nlm.nih.gov/34215830/ -.. _34886891: https://pubmed.ncbi.nlm.nih.gov/34886891/ -.. _33024153: https://pubmed.ncbi.nlm.nih.gov/33024153/ -.. _31029854: https://pubmed.ncbi.nlm.nih.gov/31029854/ -.. _32066997: https://pubmed.ncbi.nlm.nih.gov/32066997/ -.. 
_30367166: https://pubmed.ncbi.nlm.nih.gov/30367166/ - -Physiology -########## +- [PMID:32652799](https://pubmed.ncbi.nlm.nih.gov/32652799/) +- [PMID:35115664](https://pubmed.ncbi.nlm.nih.gov/35115664/) +- [PMID:29025019](https://pubmed.ncbi.nlm.nih.gov/29025019/) +- [PMID:32891909](https://pubmed.ncbi.nlm.nih.gov/32891909/) +- [PMID:31827286](https://pubmed.ncbi.nlm.nih.gov/31827286/) +- [PMID:30333485](https://pubmed.ncbi.nlm.nih.gov/30333485/) +- [PMID:33986176](https://pubmed.ncbi.nlm.nih.gov/33986176/) +- [PMID:34755188](https://pubmed.ncbi.nlm.nih.gov/34755188/) +- [PMID:34739170](https://pubmed.ncbi.nlm.nih.gov/34739170/) +- [PMID:28101782](https://pubmed.ncbi.nlm.nih.gov/28101782/) +- [PMID:34758327](https://pubmed.ncbi.nlm.nih.gov/34758327/) +- [PMID:30993572](https://pubmed.ncbi.nlm.nih.gov/30993572/) +- [PMID:32710624](https://pubmed.ncbi.nlm.nih.gov/32710624/) +- [PMID:31339449](https://pubmed.ncbi.nlm.nih.gov/31339449/) +- [PMID:32365352](https://pubmed.ncbi.nlm.nih.gov/32365352/) +- [PMID:34587152](https://pubmed.ncbi.nlm.nih.gov/34587152/) +- [PMID:31941840](https://pubmed.ncbi.nlm.nih.gov/31941840/) +- [PMID:34784250](https://pubmed.ncbi.nlm.nih.gov/34784250/) +- [PMID:32645954](https://pubmed.ncbi.nlm.nih.gov/32645954/) +- [PMID:34215830](https://pubmed.ncbi.nlm.nih.gov/34215830/) +- [PMID:34886891](https://pubmed.ncbi.nlm.nih.gov/34886891/) +- [PMID:33024153](https://pubmed.ncbi.nlm.nih.gov/33024153/) +- [PMID:31029854](https://pubmed.ncbi.nlm.nih.gov/31029854/) +- [PMID:32066997](https://pubmed.ncbi.nlm.nih.gov/32066997/) +- [PMID:30367166](https://pubmed.ncbi.nlm.nih.gov/30367166/) + +## Physiology + The RNA-seq datasets in this group involve investigations of physiology, cell biology, and gene regulation. @@ -160,46 +157,44 @@ The RNA-seq datasets in this group involve investigations of physiology, cell bi SRP149366_329,4:4,breast cells,Meng P (2019),30993572_ -.. _34903601: https://pubmed.ncbi.nlm.nih.gov/34903601/ -.. 
_34458010: https://pubmed.ncbi.nlm.nih.gov/34458010/ -.. _32463448: https://pubmed.ncbi.nlm.nih.gov/32463448/ -.. _26566875: https://pubmed.ncbi.nlm.nih.gov/26566875/ -.. _34493867: https://pubmed.ncbi.nlm.nih.gov/34493867/ -.. _30014619: https://pubmed.ncbi.nlm.nih.gov/30014619/ -.. _34301266: https://pubmed.ncbi.nlm.nih.gov/34301266/ -.. _31048689: https://pubmed.ncbi.nlm.nih.gov/31048689/ -.. _29108258: https://pubmed.ncbi.nlm.nih.gov/29108258/ -.. _34546978: https://pubmed.ncbi.nlm.nih.gov/34546978/ -.. _34580113: https://pubmed.ncbi.nlm.nih.gov/34580113/ -.. _34986855: https://pubmed.ncbi.nlm.nih.gov/34986855/ -.. _34359754: https://pubmed.ncbi.nlm.nih.gov/34359754/ -.. _32109375: https://pubmed.ncbi.nlm.nih.gov/32109375/ -.. _29066513: https://pubmed.ncbi.nlm.nih.gov/29066513/ -.. _34238982: https://pubmed.ncbi.nlm.nih.gov/34238982/ -.. _34458146: https://pubmed.ncbi.nlm.nih.gov/34458146/ -.. _26367798: https://pubmed.ncbi.nlm.nih.gov/26367798/ -.. _24929677: https://pubmed.ncbi.nlm.nih.gov/24929677/ -.. _33203734: https://pubmed.ncbi.nlm.nih.gov/33203734/ -.. _28350380: https://pubmed.ncbi.nlm.nih.gov/28350380/ -.. _33499129: https://pubmed.ncbi.nlm.nih.gov/33499129/ -.. _34059508: https://pubmed.ncbi.nlm.nih.gov/34059508/ -.. _24176112: https://pubmed.ncbi.nlm.nih.gov/24176112/ -.. _33147339: https://pubmed.ncbi.nlm.nih.gov/33147339/ -.. _33318192: https://pubmed.ncbi.nlm.nih.gov/33318192/ -.. _30037853: https://pubmed.ncbi.nlm.nih.gov/30037853/ -.. _31843922: https://pubmed.ncbi.nlm.nih.gov/31843922/ -.. _29152775: https://pubmed.ncbi.nlm.nih.gov/29152775/ -.. _23024189: https://pubmed.ncbi.nlm.nih.gov/23024189/ -.. _33106178: https://pubmed.ncbi.nlm.nih.gov/33106178/ -.. _34270926: https://pubmed.ncbi.nlm.nih.gov/34270926/ -.. _30770362: https://pubmed.ncbi.nlm.nih.gov/30770362/ -.. _32629178: https://pubmed.ncbi.nlm.nih.gov/32629178/ -.. 
_32888433: https://pubmed.ncbi.nlm.nih.gov/32888433/ - - - -Cancer -###### +- [PMID:34903601](https://pubmed.ncbi.nlm.nih.gov/34903601/) +- [PMID:34458010](https://pubmed.ncbi.nlm.nih.gov/34458010/) +- [PMID:32463448](https://pubmed.ncbi.nlm.nih.gov/32463448/) +- [PMID:26566875](https://pubmed.ncbi.nlm.nih.gov/26566875/) +- [PMID:34493867](https://pubmed.ncbi.nlm.nih.gov/34493867/) +- [PMID:30014619](https://pubmed.ncbi.nlm.nih.gov/30014619/) +- [PMID:34301266](https://pubmed.ncbi.nlm.nih.gov/34301266/) +- [PMID:31048689](https://pubmed.ncbi.nlm.nih.gov/31048689/) +- [PMID:29108258](https://pubmed.ncbi.nlm.nih.gov/29108258/) +- [PMID:34546978](https://pubmed.ncbi.nlm.nih.gov/34546978/) +- [PMID:34580113](https://pubmed.ncbi.nlm.nih.gov/34580113/) +- [PMID:34986855](https://pubmed.ncbi.nlm.nih.gov/34986855/) +- [PMID:34359754](https://pubmed.ncbi.nlm.nih.gov/34359754/) +- [PMID:32109375](https://pubmed.ncbi.nlm.nih.gov/32109375/) +- [PMID:29066513](https://pubmed.ncbi.nlm.nih.gov/29066513/) +- [PMID:34238982](https://pubmed.ncbi.nlm.nih.gov/34238982/) +- [PMID:34458146](https://pubmed.ncbi.nlm.nih.gov/34458146/) +- [PMID:26367798](https://pubmed.ncbi.nlm.nih.gov/26367798/) +- [PMID:24929677](https://pubmed.ncbi.nlm.nih.gov/24929677/) +- [PMID:33203734](https://pubmed.ncbi.nlm.nih.gov/33203734/) +- [PMID:28350380](https://pubmed.ncbi.nlm.nih.gov/28350380/) +- [PMID:33499129](https://pubmed.ncbi.nlm.nih.gov/33499129/) +- [PMID:34059508](https://pubmed.ncbi.nlm.nih.gov/34059508/) +- [PMID:24176112](https://pubmed.ncbi.nlm.nih.gov/24176112/) +- [PMID:33147339](https://pubmed.ncbi.nlm.nih.gov/33147339/) +- [PMID:33318192](https://pubmed.ncbi.nlm.nih.gov/33318192/) +- [PMID:30037853](https://pubmed.ncbi.nlm.nih.gov/30037853/) +- [PMID:31843922](https://pubmed.ncbi.nlm.nih.gov/31843922/) +- [PMID:29152775](https://pubmed.ncbi.nlm.nih.gov/29152775/) +- [PMID:23024189](https://pubmed.ncbi.nlm.nih.gov/23024189/) +- [PMID:33106178](https://pubmed.ncbi.nlm.nih.gov/33106178/) +- 
[PMID:34270926](https://pubmed.ncbi.nlm.nih.gov/34270926/) +- [PMID:30770362](https://pubmed.ncbi.nlm.nih.gov/30770362/) +- [PMID:32629178](https://pubmed.ncbi.nlm.nih.gov/32629178/) +- [PMID:32888433](https://pubmed.ncbi.nlm.nih.gov/32888433/) + +## Cancer + The RNA-seq datasets in this group involve investigations of cancer. @@ -247,14 +242,12 @@ The RNA-seq datasets in this group involve investigations of cancer. SRP312693_314,4:4,medulloblastoma,Rea J (2021),34359754_ SRP254646_315,11:11,prostate cancer,He YD (2021),34301266_ -.. _26873097: https://pubmed.ncbi.nlm.nih.gov/26873097/ -.. _31434901: https://pubmed.ncbi.nlm.nih.gov/31434901/ -.. _33897690: https://pubmed.ncbi.nlm.nih.gov/33897690/ -.. _34128839: https://pubmed.ncbi.nlm.nih.gov/34128839/ -.. _31844885: https://pubmed.ncbi.nlm.nih.gov/31844885/ - -Infectious disease -################## +- [PMID:26873097](https://pubmed.ncbi.nlm.nih.gov/26873097/) +- [PMID:31434901](https://pubmed.ncbi.nlm.nih.gov/31434901/) +- [PMID:33897690](https://pubmed.ncbi.nlm.nih.gov/33897690/) +- [PMID:34128839](https://pubmed.ncbi.nlm.nih.gov/34128839/) +- [PMID:31844885](https://pubmed.ncbi.nlm.nih.gov/31844885/) +## Infectious disease The RNA-seq datasets in this group involve investigations of infectious disease. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..a06c6576 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,21 @@ +# IsopretGO: Isoform interpretation + +IsopretGO (Isoform Interpretation for Gene Ontology) leverages predictions of isoform-specific functions (i.e., Gene Ontology [GO] annotations) +made by the [isopret expectation maximization algorithm](https://pubmed.ncbi.nlm.nih.gov/36929917/){:target="_blank"} +to perform gene-level and isoform-level GO overrepresentation analysis. + + + + +Isopret uses the analysis (output) file of +[HBA-DEALS](https://pubmed.ncbi.nlm.nih.gov/32660516/){:target="_blank"}. 
+ +TODO -- point to tutorial and also mention edgeR + +HBA-DEALS +analyzes RNA-Seq data to determine differential expression and differential +splicing simultaneously. Isopret then performs +Gene Ontology analysis using a Java 17 implementation of code from +the [Ontologizer](https://pubmed.ncbi.nlm.nih.gov/18511468/){:target="_blank"}. + + diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 492e728c..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,28 +0,0 @@ -=============================== -Isopret: Isoform interpretation -=============================== - -Isopret (Isoform Interpretation) is an expectation-maximization algorithm able -to infer isoform specific functions using a global optimization approach. - - - - -Isopret uses the analysis (output) file of -`HBA-DEALS `_. HBA-DEALS -analyzes RNA-Seq data to determine differentially expression and differential -splicing simultaneous. Isopret then performs -Gene Ontology analysis using a Java 17 implementation of code from -the `Ontologizer `_. - - - -.. toctree:: - :maxdepth: 1 - :caption: Contents: - - input - examples - running-inferrence - running-go - diff --git a/docs/input.md b/docs/input.md new file mode 100644 index 00000000..eee1ce04 --- /dev/null +++ b/docs/input.md @@ -0,0 +1,76 @@ +# Input files + + +isopret can be run with [HBA-DEALS](#hba-deals) or [edgeR](#edger) results files as input. + +## HBA-DEALS + + + +Hierarchical Bayesian Analysis of Differential Expression and ALternative Splicing (HBA-DEALS) +simultaneously characterizes differential expression and splicing in cohorts. + + +![HBA-DEALS model](/img/hbadeals.png){ width="80%" } + + + + HBA-DEALS. The log-transformed expression of a gene with three isoforms (green, orange, and blue) is shown. The gene expression is the sum of the expression of the isoforms. Differential gene expression is modeled as two Normal distributions whose means differ by the parameter β. 
The proportions of the corresponding isoforms have a Dirichlet prior, and the difference in proportions between controls and cases is modeled by α (symbolized by the two triangles). An MCMC procedure is used to solve for the posterior distribution of the parameters of the model for all genes and isoforms at once. + +The HBA-DEALS algorithm is explained in [Karlebach et al, 2020, Genome Biology 21:171](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02072-6). +Source code is available at the [HBA-DEALS GitHub repository](https://github.com/TheJacksonLaboratory/HBA-DEALS). + + +### Important + + +For use with isopret, HBA-DEALS *must* be run with the ``isoform.level`` option set to true. +See the [HBADEALS documentation](https://hba-deals.readthedocs.io/en/latest/) +for details. + + + + +## HBA-DEALS output format + + +HBA-DEALS produces an output file with the following format. A gene is shown +using its [Ensembl](http://ensembl.org/) identifier. If the word ``Expression`` +appears in the second column, then the row refers to the gene, otherwise it +refers to the indicated transcript (isoform). The third column refers to the +expression log fold change (if the row refers to a gene) or the fold change +(if the row refers to an isoform). The fourth column is the posterior error probability (PEP; see +[Käll et al.](https://pubs.acs.org/doi/10.1021/pr700739d) for an explanation of PEP). 
+ + +| Gene |Isoform | ExplogFC/FC | P | +|:----------------|:----------------|:----------------|:---------------| +| ENSG00000001629 | Expression | 1.17010167106799| 0.19007 | +| ENSG00000001629 | ENST00000265742 | 1.17010167106799| 0.22928 | +| ENSG00000001629 | ENST00000422095 | 0.51809521525105| 0.04285 | +| ENSG00000001629 | ENST00000442183 | 1.04618237020161| 0.23606 | +| ENSG00000002586 | Expression | 0.53692491414042| 0.05712 | +| ENSG00000002586 | ENST00000381177 | 1.63300567810385| 0.10156 | + + + + +IsopretGO currently supports +only [Ensembl](http://ensembl.org/) gene/transcript models. + + + + +## Running HBA-DEALS + + +A Snakemake pipeline for running HBA-DEALS is described in the +[GitHub repository](https://github.com/TheJacksonLaboratory/covid19splicing) +for the paper [Betacoronavirus-specific alternate splicing](https://pubmed.ncbi.nlm.nih.gov/35074468/). The GitHub repository +also contains 15 HBA-DEALS output files related to that publication that can be used as input for Isopret-Gui. + + +## edgeR + + +TODO: describe the edgeR input format. \ No newline at end of file diff --git a/docs/input.rst b/docs/input.rst deleted file mode 100644 index f141d975..00000000 --- a/docs/input.rst +++ /dev/null @@ -1,85 +0,0 @@ -.. _rstinput: - -=================== -Isopret input files -=================== - -isopret can be run with `HBA-DEALS`_ or `edgeR`_ results files as input. - -HBA-DEALS -========= -.. _HBA-DEALS: - - -Hierarchical Bayesian Analysis of Differential Expression and ALternative Splicing (HBA-DEALS) -simultaneously characterizes differential expression and splicing in cohorts. - - -.. figure:: /img/hbadeals.png - :width: 80% - :align: center - - HBA-DEALS. The log-transformed expression of a gene with three isoforms (green, orange, and blue) is shown. The gene expression is the sum of the expression of the isoforms. Differential gene expression is modeled as two Normal distributions whose means differ by the parameter β. 
The proportions of the corresponding isoforms have a Dirichlet prior, and the difference in proportions between controls and cases is modeled by α (symbolized by the two triangles). An MCMC procedure is used to solve for the posterior distribution of the parameters of the model for all genes and isoforms at once. - -The HBA-DEALS algorithm is explained in `Karlebach et al, 2020, Genome Biology 21:171 `_. -Source code is available at the `HBA-DEALS GitHub repository `_. - - -Important -========= - -For use with isopret, HBA-DEALS *must* be run with the ``isoform.level`` option set to true. -See the `HBADEALS documentation `_ -for details. - - - - -HBA-DEALS output format -^^^^^^^^^^^^^^^^^^^^^^^ - -HBA-DEALS produces an output file with the following format. A gene is shown -using its `Ensembl `_ identifier. If the word ``Expression`` -appears in the second column, then the row refers to the gene, otherwise it -refers to the indicated transcript (isoform). The third column refers to the -expression log fold change (if the row refers the a gene) or the fold change -(if the row refers to an isoform). The fourth column is the posterior error probability (PEP; see -`Käll et al. `_ for an explanation). 
- - -+-----------------+-----------------+-----------------+-----------------+ -| Gene |Isoform | ExplogFC/FC | P | -| | | | | -+=================+=================+=================+=================+ -| ENSG00000001629 | Expression | 1.17010167106799| 0.19007 | -+-----------------+-----------------+-----------------+-----------------+ -| ENSG00000001629 | ENST00000265742 | 1.17010167106799| 0.22928 | -+-----------------+-----------------+-----------------+-----------------+ -| ENSG00000001629 | ENST00000422095 | 0.51809521525105| 0.04285 | -+-----------------+-----------------+-----------------+-----------------+ -| ENSG00000001629 | ENST00000442183 | 1.04618237020161| 0.23606 | -+-----------------+-----------------+-----------------+-----------------+ -| ENSG00000002586 | Expression | 0.53692491414042| 0.05712 | -+-----------------+-----------------+-----------------+-----------------+ -| ENSG00000002586 | ENST00000381177 | 1.63300567810385| 0.10156 | -+-----------------+-----------------+-----------------+-----------------+ - - -Isopret supports -only `Ensembl `_ gene/transcript models. - - - - -Running HBA-DEALS -^^^^^^^^^^^^^^^^^ - -A Snakemake pipeline for running HBA-DEALS is described in the `GitHub repository `_ -for the paper `Betacoronavirus-specific alternate splicing `_. The GitHub repository -also contains 15 HBA-DEALS output files related to that publicaton that can be used as input for Isopret-Gui. - - -edgeR -===== -.. _edgeR: -todo \ No newline at end of file diff --git a/docs/output.rst b/docs/output.md similarity index 66% rename from docs/output.rst rename to docs/output.md index 50cfea14..f521fbed 100644 --- a/docs/output.rst +++ b/docs/output.md @@ -1,13 +1,10 @@ -.. 
_rstoutput: +# Output -============== -isopret output -============== Directly after finishing the analysis, Isopret shows the ``Analysis`` (Overview) tab, with a table with the counts of differentially expressed and differentially spliced genes (from the HBA-DEALS analysis) -and the counts of `Gene Ontology `_ (GO) terms found to be significantly -overrepresented among the differentiall expressed genes (DGE) and the differentially alternatively spliced +and the counts of [Gene Ontology](http://geneontology.org/) (GO) terms found to be significantly +overrepresented among the differentially expressed genes (DGE) and the differentially alternatively spliced isoforms (DAS). The lower table shows each gene with at least one count in the RNA-seq experiment, together with the @@ -16,15 +13,18 @@ of any isoform. Genes or isoforms whose PEP is less than the threshold determine false discovery rate (FDR) are highlighted in color. -.. figure:: /img/isopret3.png - :width: 70% - :align: center - Isopret. Overview tab. +
+![Overview tab](/img/isopret3.png){ width="1000" } +
IsopretGO overview tab. +
+
+ + + +## Gene view -Gene view -^^^^^^^^^ If you click on the ``Visualize`` button for a gene in the ``Analysis`` pane, Isopret will open a tab with visualizations and analysis results for that gene. The tab has three parts -- isoforms, protein domains, and GO analysis. @@ -32,53 +32,61 @@ and analysis results for that gene. The tab has three parts -- isoforms, protein The isoform section shows the structure of each isoform that has at least one read in the RNA-seq data. +
+![Gene tab](/img/isopret4.png){ width="1000" } +
IsopretGO gene tab, isoforms section. +
+
-.. figure:: /img/isopret4.png - :width: 70% - :align: center - Isopret. Gene tab, isoforms section -The protein domain section shows the protein domains (using data from `InterPro `_) of +The protein domain section shows the protein domains (using data from [InterPro](http://www.ebi.ac.uk/interpro/)) of each of the isoforms. -.. figure:: /img/isopret5.png - :width: 70% - :align: center +
+![Gene tab](/img/isopret5.png){ width="1000" } +
IsopretGO gene tab, protein domain section. +
+
- Isopret. Gene tab, protein domain section Finally, the GO section lists all of the GO annotations associated with the gene and shows the inferred isoform level annotations. GO terms that are significant for either expression or splicing at the experiment level are shown at the top of the table and highlighted green. -.. figure:: /img/isopret6.png - :width: 70% - :align: center - Isopret. Gene tab, Gene Ontology section +
+![Gene Ontology section](/img/isopret6.png){ width="1000" } +
IsopretGO gene tab, Gene Ontology section. +
+
+ + +## Gene Ontology view -Gene Ontology view -^^^^^^^^^^^^^^^^^^ The DGE (differential gene expression) and DAS (differential alternative splicing) tabs show the GO terms found to be overrepresented for expression or splicing. The table shows a list of the terms and the corresponding p-values. +
+![Gene Ontology DAS tab](/img/isopret7.png){ width="1000" } +
IsopretGO gene tab, Gene Ontology DAS tab. +
+
-.. figure:: /img/isopret7.png - :width: 70% - :align: center - - Isopret. Gene Ontology DAS tab Clicking on the ``Compare DGE & DAS`` button will open a dialog that shows the negative logarithm of the p-values of GO terms to compare results for expression and splicing. - +
+![Compare DGE and DAS dialog](/img/isopret8.png){ width="1000" } +
 IsopretGO, dialog comparing GO term p-values for DGE and DAS. +
+
.. figure:: /img/isopret8.png :width: 70% :align: center diff --git a/docs/running-go.rst b/docs/running-cmd.md similarity index 53% rename from docs/running-go.rst rename to docs/running-cmd.md index 8f2afe6f..ba04360a 100644 --- a/docs/running-go.rst +++ b/docs/running-cmd.md @@ -1,8 +1,5 @@ -.. _rstrunningcli: +# Command line -============================= -Running isopret: Command line -============================= Isopret offers most functionality via its graphical user interface (GUI) version, but does have a command-line tool that can be used to perform Gene Ontology (GO) @@ -15,98 +12,90 @@ directories if you like. For the following explanation, we will assume that the command-line app ``isopret-cli`` is in the current working directory. -The download command -~~~~~~~~~~~~~~~~~~~~ +### The download command -.. _rstdownload: -isopret requires some additional files to run. + +isopretGO requires some additional files to run. 1. ``go.json``. The main Gene Ontology file 2. ``goa_human.gaf``. The Gene Ontology annotation file -3. ``hg38_ensembl.ser`` The `Jannovar `_ transcript information file +3. ``hg38_ensembl.ser`` The [Jannovar](https://github.com/charite/jannovar) transcript information file 4. ``hgnc_complete_set.txt`` A file from HGNC with information about human genes 5. ``interpro_domains.txt`` A file from the isopret GitHub derived from interpro biomaRt data 6. ``interpro_domain_desc.txt`` A file from the isopret GitHub derived from interpro biomaRt data -7. ``isoform_function_list_mf.txt`` A file that is generated by the isoform function (GO molecular function) inferrence algorithm described in the manuscript. -8. ``isoform_function_list_bp.txt`` A file that is generated by the isoform function (GO biological process) inferrence algorithm described in the manuscript. +7. ``isoform_function_list_mf.txt`` A file that is generated by the isoform function (GO molecular function) inference algorithm described in the manuscript. +8. 
``isoform_function_list_bp.txt`` A file that is generated by the isoform function (GO biological process) inference algorithm described in the manuscript. -isopret offers a convenience function to download all the files +isopretGO offers a convenience function to download all the files to a local directory (by default a subdirectory ``data`` is created in the current working directory). You can change this default with the ``-d`` or ``--data`` options (If you change this, then you will need to pass the location of your directory to all other isopret commands using the ``-d`` flag). Download the files automatically as follows. -.. code-block:: java - - java -jar isopret-cli.jar download - +```bash +java -jar isopret-cli.jar download +``` + isopret will not download the files if they are already present unless the ``--overwrite`` argument is passed. For instance, the following command would download the four files to a directory called datafiles and would overwrite any previously downloaded files. -.. code-block:: java - - java -jar isopret-cli.jar download -d datafiles --overwrite +```bash +java -jar isopret-cli.jar download -d datafiles --overwrite +``` - -If desired, you can download these files on your own but you need to place them all in the +If desired, you can download these files on your own, but you need to place them all in the same directory to run isopret. The GUI uses the same functionality to download these files to a directory that is chosen or created by the user; this directory can be used by both the command-line and the GUI version of isopret. -The GO command -^^^^^^^^^^^^^^ +## The GO command + This command performs Gene Ontology overrepresentation analysis. The download command must be run before running this command. The ``--hbadeals`` option is required and indicates the path to an HBA-DEALS output file. -.. 
code-block:: java - - java -jar isopret-cli.jar GO --hbadeals +```bash +java -jar isopret-cli.jar GO --hbadeals +``` The options are -+------------------+--------------------------------------------------------------------------------------+ -| Option | Explanation | -+==================+======================================================================================+ -| -b,--hbadeals | Path to HBA-DEALS output file (required) | -+------------------+--------------------------------------------------------------------------------------+ -| -c,--calculation | Ontologizer calculation (Term-for-Term, PC-Union, PC-Intersection; default: TfT) | -+------------------+--------------------------------------------------------------------------------------+ -| --mtc | Multiple-Testing-Correction for GO analysis (default Bonferroni) | -+------------------+--------------------------------------------------------------------------------------+ -| -v", --verbose | Print stats in shell | -+------------------+--------------------------------------------------------------------------------------+ -| --outfile | Name of output file to write stats (default gene-ontology-overrep-{input}.tsv | -+------------------+--------------------------------------------------------------------------------------+ +| Option | Explanation | +|:-----------------|:--------------------------------------------------------------------------------------| +| -b,--hbadeals | Path to HBA-DEALS output file (required) | +| -c,--calculation | Ontologizer calculation (Term-for-Term, PC-Union, PC-Intersection; default: TfT) | +| --mtc | Multiple-Testing-Correction for GO analysis (default Bonferroni) | +| -v", --verbose | Print stats in shell | +| --outfile | Name of output file to write stats (default gene-ontology-overrep-{input}.tsv | + + +## The interpro command -The interpro command -^^^^^^^^^^^^^^^^^^^^ This command performs interpro domain overrepresentation analysis. 
The download command must be run before running this command. The ``--hbadeals`` option is required and indicates the path to an HBA-DEALS output file. -.. code-block:: java - - java -jar isopret-cli.jar interpro --hbadeals +```bash +java -jar isopret-cli.jar interpro --hbadeals +``` The options are -+------------------+--------------------------------------------------------------------------------------+ + | Option | Explanation | -+==================+======================================================================================+ +|:-----------------|:-------------------------------------------------------------------------------------| | -b,--hbadeals | Path to HBA-DEALS output file (required) | -+------------------+--------------------------------------------------------------------------------------+ | -d,--download | data download directory (``data`` by default) | -+------------------+--------------------------------------------------------------------------------------+ | --outfile | Name of output file to write stats (default gene-ontology-overrep-{input}.tsv | -+------------------+--------------------------------------------------------------------------------------+ + + diff --git a/docs/running-gui.rst b/docs/running-gui.md similarity index 50% rename from docs/running-gui.rst rename to docs/running-gui.md index c3709b7f..36f522cd 100644 --- a/docs/running-gui.rst +++ b/docs/running-gui.md @@ -1,86 +1,85 @@ -.. _rstrunninggui: +# GUI version -============================ -Running isopret: GUI version -============================ Isopret-gui is a Java application for investigating and visualizing overrepresentation of -`Gene Ontology (GO) `_ annotations in differentially spliced or -differentially expressed genes. Only the graphical user interface (GUI) version of Isopret offers the full +[Gene Ontology (GO)](http://geneontology.org/) annotations in differentially spliced or +differentially expressed genes. 
Only the graphical user interface (GUI) version of isopretGO offers the full functionality. -Starting isopret for the first time: Downloading input files -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### Starting isopretGO for the first time: Downloading input files -Isopret can be started with a double click (assuming Java 17 is installed on your computer) or from + +IsopretGO can be started with a double click (assuming Java 17 is installed on your computer) or from the command line as -.. code-block:: - :caption: starting isopret from the command line +``` shell title="Starting isopretGO from the command line" +java -jar Isopret.jar +``` - java -jar Isopret.jar +
+![First run](/img/isopret1.png){ width="1000" } +
Appearance of the app when started for the first time.
+
-.. figure:: /img/isopret1.png - :width: 70% - :align: center - Isopret. Appearance of the app when started for the first time. You should now click on the ``Download`` button and choose a directory to which to download the data files required -by isopret to run (these files include the inferred isoform function files of the project as well as other files such +by isopret to run (these files include the inferred isoformGO function files of the project as well as other files such as the Gene Ontology json file). This step only needs to be performed once. -Choosing the HBA-DEALS file -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### Choosing the HBA-DEALS file + It is assumed you will have run HBA-DEALS on RNA-seq files of interest prior to running Isopret. Choose the output file of HBA-DEALS. -Gene Ontology Settings -^^^^^^^^^^^^^^^^^^^^^^ +### Gene Ontology Settings + -Isopret offers three Gene Ontology (GO) overrepresentation algorithms. The ``Term-for-Term`` method is the +isopretGO offers three Gene Ontology (GO) overrepresentation algorithms. The ``Term-for-Term`` method is the standard procedure for assessing whether genes annotated to a specific GO term are more common in the set of differentially expressed genes than one would expect given the proportion of all genes that are annotated to the term. Formally, it is the upper tail of a hypergeometric distribution, which is also known as the one-tailed Fisher's exact test -(`Bauer et al., 2008 `_). +([Bauer et al., 2008](https://academic.oup.com/bioinformatics/article/24/14/1650/182451?login=false)). The drawback of the term-for-term approach is that it does not respect dependencies between the GO terms that are caused by overlapping annotations. As a result of the true-path rule, each term in GO shares all the annotations of all of its descendants. 
Isopret also offers two algorithms for GO analysis that assess GO term overrepresentation that examines each term in the context of its parent terms, -which we call the parent–child approach (`Grossmann et al, 2007 `_). +which we call the parent–child approach +([Grossmann et al., 2007](https://academic.oup.com/bioinformatics/article/23/22/3024/208216?login=false)). In our experiments, the ``parent–child-intersection`` approach is generally more conservative than the ``parent–child-union`` approach. -Multiple testing correction -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### Multiple testing correction + Isopret offers the following multiple-testing correction options for the GO analysis: Bonferroni, Bonferroni-Holm, -Sidak, Benjamini-Hochberg, Benjamini-Yukutieli, None. The book `Introduction to Bio-Ontologies `_ +Sidak, Benjamini-Hochberg, Benjamini-Yukutieli, None. The book +[Introduction to Bio-Ontologies](https://www.routledge.com/Introduction-to-Bio-Ontologies/Robinson-Bauer/p/book/9780367659271) provides detailed explanations of the GO Overrepresentation analysis procedures and multiple testing correction approaches. +
+![First run](/img/isopret2.png){ width="1000" } +
 Appearance of the app after data download with an HBA-DEALS file chosen and the analysis set to +Parent-Child Intersection with Benjamini-Hochberg MTC.
+
+ + -.. figure:: /img/isopret2.png - :width: 70% - :align: center - Isopret. Appearance of the app after data download with an HBA-DEAL file chosen and the analysis set to Parent-Child Intersection with Benjamini-Hochberg MTC.. +## Running isopretGO -Running isopret -^^^^^^^^^^^^^^^ Finally, click the ``Analyse`` button to start the analysis. The tool will typically require less than 5 minutes to complete on a typical laptop or consumer desktop. Users can follow progress with a progress bar. If analysis is slow, consider starting Isopret with additional memory. +```bash title="starting isopret with additional memory" +java -Xmx8g -jar Isopret.jar +``` -.. code-block:: - :caption: starting isopret with additional memory - - java -Xmx 8g -jar Isopret.jar When the analysis finished, two new tabs will appear, DGE (differental gene expression) -and DAS (differential alternative splicing). See :ref:`rstoutput` for instructions on how to interpret the results. \ No newline at end of file +and DAS (differential alternative splicing). See [output](output.md) for instructions on how to interpret the results. \ No newline at end of file diff --git a/docs/running-inferrence.rst b/docs/running-inferrence.md similarity index 100% rename from docs/running-inferrence.rst rename to docs/running-inferrence.md diff --git a/docs/running-pdf.rst b/docs/running-pdf.md similarity index 58% rename from docs/running-pdf.rst rename to docs/running-pdf.md index bad7a314..25c3d57b 100644 --- a/docs/running-pdf.rst +++ b/docs/running-pdf.md @@ -1,14 +1,11 @@ -.. _rstrunningpdf: +# Exporting PDF file -=================================== -Running isopret: Exporting PDF file -=================================== Ispopret allows users to export PDF files of the isoform or protein domain graphics created by isorpret for each analyzed gene.
To create the PDF file, isopret first creates a SVG file and then uses -the program ``rsvg-convert `_ +the program [rsvg-convert](https://helpmanual.io/help/rsvg-convert/) to convert the SVG file to a PDF file (if desired, the SVG file can be saved itself). @@ -22,30 +19,39 @@ In this case, you can save the SVG file and convert it to PDF using any method of choice. -Installing rsvg-convert -####################### +### Installing rsvg-convert -Macintosh -^^^^^^^^^ -The easiest way to install the software is with brew. :: +- Macintosh - brew install librsvg +The easiest way to install the software is with brew. -Debian-flavors of linux -^^^^^^^^^^^^^^^^^^^^^^^ +```bash +brew install librsvg +``` -On Ubuntu and other linuxes with apt-get, use the following command. :: + - apt-get install librsvg2-bin +- Debian-flavors of linux -Running rsvg-convert -#################### -If you are not able to get isopret to use rsvg-convert, but have it on your system +On Ubuntu and other linuxes with apt-get, use the following command. + +```bash +apt-get install librsvg2-bin +``` + + +## Running rsvg-convert + + +If you are not able to get isopretGO to use rsvg-convert, but have it on your system and have saved an SVG file, this is the command that is used to -convert the file to PDF. :: +convert the file to PDF. - commands = rsvg-convert -f pdf -o myfilename.pdf myfilename.svg +```bash +commands = rsvg-convert -f pdf -o myfilename.pdf myfilename.svg +``` + myfilename is the base file name of the SVG file and of the PDF file that will be created. \ No newline at end of file diff --git a/docs/running.rst b/docs/running.md similarity index 57% rename from docs/running.rst rename to docs/running.md index 8195682e..c50457dc 100644 --- a/docs/running.rst +++ b/docs/running.md @@ -1,19 +1,15 @@ -.. 
_rstrunning: +# Running isopretGO -=============== -Running isopret -=============== Isopret offers a command-line tools that performs Gene Ontology overrepresentation analysis with the inferred isoform GO annotations. A GUI version with additional visualization tools is in preparation. -.. toctree:: - :maxdepth: 1 - :caption: Contents: +- [running-inferrence](running-inferrence.md) +- [running-cmd](running-cmd.md) - running-inferrence - running-cmd + + diff --git a/docs/setup.rst b/docs/setup.md similarity index 70% rename from docs/setup.rst rename to docs/setup.md index 21c936cd..0c317345 100644 --- a/docs/setup.rst +++ b/docs/setup.md @@ -1,10 +1,8 @@ -.. _rstsetup: -====================== -Setting up isopret-gui -====================== +# Set up + isopret-gui is a desktop Java application. You can download precompiled executable -JAR files from the `Releases page `_ +JAR files from the [Releases page](https://github.com/TheJacksonLaboratory/isopret/releases) of the GitHub site. Later, we will generate stand-alone Windows and Mac native apps. This is currently the recommended way of using isopret-gui. @@ -12,18 +10,18 @@ This is currently the recommended way of using isopret-gui. Additionally, the following text describes how to build isopret-gui from source. -Prerequisites -~~~~~~~~~~~~~ +### Prerequisites + isopret-gui was written with Java version 17. If you want to build isopret-gui from source, then the build process described below requires -`Git `_ and `maven `_ (version 3.5.3 or higher). +[Git](https://git-scm.com/book/en/v2) and [maven](https://maven.apache.org/install.html) (version 3.5.3 or higher). + +### Installation -Installation -~~~~~~~~~~~~ -Go the GitHub page of `isopret `_, and clone the project. +Go to the GitHub page of [isopretGO](https://github.com/TheJacksonLaboratory/isopretGO), and clone the project. Build the executable from source with maven, and then test the build.
:: git clone https://github.com/TheJacksonLaboratory/isopret.git diff --git a/em_scripts/combine_tables.R b/em_scripts/combine_tables.R deleted file mode 100644 index 31ee3a73..00000000 --- a/em_scripts/combine_tables.R +++ /dev/null @@ -1,197 +0,0 @@ -library(data.table) - -library(parallel) - -library(Matrix) - - - -number.of.nodes=200 - -if (file.exists('convergence_log.txt')) -{ - if (as.integer(read.table('convergence_log.txt',header=F))==-1) - - quit('no') - -} - -combined.table=NULL - -combined.x.seq=NULL - -combined.isoform.functions=NULL - -combined.transcript.ids=NULL - -share.vec=c() - -fit.sample=c() - -new.coefs=NULL - -total.lik=0 - -#read - -for (node.number in 1:number.of.nodes) -{ - - load(paste0('interpro_state_',node.number,'.RData')) - - total.lik=total.lik+res.ga@fitnessValue/sum(lower.tri(seq.sim.mat) & compare.pairs) - - if (is.null(combined.table)) - { - combined.table=iso.has.func - - }else{ - - if (sum(colnames(iso.has.func) %in% colnames(combined.table))> lik_diff.txt')) - - dif.log=read.table('lik_diff.txt') - - if (nrow(dif.log)>25) - - if(cumsum(dif.log$V1)[nrow(dif.log)]-cumsum(dif.log$V1)[nrow(dif.log)-25]<1/3) - { - print('Algorithm Converged') - - share.vec2=share.vec^2 - - new.coefs=lm(fit.sample~share.vec+share.vec2)$coefficients - - if (!file.exists('convergence_log.txt')) - { - write.table(args[1],'convergence_log.txt',col.names = F,row.names = F,quote = F) - }else{ - prev.conv=read.table('convergence_log.txt',header=F) - - if ((as.integer(prev.conv)+1==as.integer(args[1])) || as.integer(prev.conv)==-1) - { - print('Local Maximum Reached') - - write.table(-1,'convergence_log.txt',col.names = F,row.names = F,quote = F) - - quit('no') - }else - { - write.table(args[1],'convergence_log.txt',col.names = F,row.names = F,quote = F) - } - - rm(prev.conv) - } - - } - - -}else{ - - writeMM(combined.table,'combined_iso_has_func.txt') - - write.table(colnames(combined.table),'colnames.txt',sep='\t',col.names = F,row.names = F,quote = F) - 
- write.table(rownames(combined.table),'rownames.txt',sep='\t',col.names = F,row.names = F,quote = F) - - write.table(total.lik,'last_lik.txt',sep='\t',col.names = F,row.names = F,quote = F) - - -} - - -#re-allocate - -rm(fit.sample) - -rm(share.vec) - -split.ids.comb=sample(1:number.of.nodes,length(combined.transcript.ids),replace=TRUE) - -for (node.number in 1:number.of.nodes) -{ - - transcript.ids=combined.transcript.ids[which(split.ids.comb==node.number)] - - x.seq=combined.x.seq[which(names(combined.x.seq) %in% transcript.ids)] - - isoform.functions=combined.isoform.functions[which(names(combined.isoform.functions) %in% transcript.ids)] - - iso.has.func=combined.table[rownames(combined.table) %in% transcript.ids,] - - if (!is.null(new.coefs)) - - coefs=new.coefs - - save(transcript.ids,x.seq,isoform.functions,iso.has.func,coefs,file=paste0('interpro_state_',node.number,'.RData')) - -} - diff --git a/em_scripts/predict2.R b/em_scripts/predict2.R deleted file mode 100644 index 271dc059..00000000 --- a/em_scripts/predict2.R +++ /dev/null @@ -1,385 +0,0 @@ -library(data.table) - -library(goseq) - -library(Biostrings) - -library(Rcpi) - -library(Matrix) - -library(GA) - -library(bestNormalize) - -#This script optimizes isoform function assignment to predict local alignment between isoforms optimally. There are 200 instances of this script -#that are executed each iteration, each handles a different subset of isoforms. - -#Change the following paths as needed: - -path.to.gtf.file='/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/Homo_sapiens.GRCh38.91.gtf' - -path.to.hgnc='hgnc_complete_set.txt' - -path.to.interpro='interpro_domains.txt' - -path.to.gaf='goa_human.gaf' - -path.to.sequences='/projects/robinson-lab/USERS/karleg/projects/isopret/isoform_seqs' - -num.cores=4 #The number of cores that will be used by this script - - -#The following function calculates the fitness, i.e. 
minus the sum of squared residuals, of the regression model -#for a given assignment of GO terms to isoforms(the input parameter sol). The prefix 'ga' is used because the optimization of function -#assignment is performed by a Genetic Algorithm (appears later in this script) - -ga.fitness=function(sol) -{ - - number.shared.functions=Matrix::tcrossprod(Matrix::sparseMatrix(i=rep(1:length(transcript.ids),start.funcs)[sol==1], - j = unlist(isoform.functions)[sol==1],dims=c(nrow(iso.has.func),ncol(iso.has.func))),boolArith=F) - - b=number.shared.functions[lower.tri(number.shared.functions) & compare.pairs] - - a=seq.sim.mat[lower.tri(seq.sim.mat) & compare.pairs] - - v1=(b^2)*coefs[3]+b*coefs[2]+coefs[1] - - -sum((v1-a)^2) - -} - -calcParProtSeqSim=function (protlist, cores = 2, type = "local", submat = "BLOSUM62") -{ - doParallel::registerDoParallel(cores) - idx = combn(1:length(protlist), 2) - seqsimlist = vector("list", ncol(idx)) - seqsimlist <- foreach(i = 1:length(seqsimlist), .errorhandling = "pass") %dopar% - { - tmp <- IPcalcSeqPairSim(rev(idx[, i]), protlist = protlist, - type = type, submat = submat) - } - seqsimmat = matrix(0, length(protlist), length(protlist)) - for (i in 1:length(seqsimlist)) seqsimmat[idx[2, i], idx[1, - i]] = seqsimlist[[i]] - seqsimmat[upper.tri(seqsimmat)] = t(seqsimmat)[upper.tri(t(seqsimmat))] - diag(seqsimmat) = 1 - return(seqsimmat) -} - -IPcalcSeqPairSim=function (twoid,protlist = protlist, type = type, submat = submat) -{ - id1 = twoid[1] - id2 = twoid[2] - if (protlist[[id1]] == "" | protlist[[id2]] == "") { - sim = 0L - } - else { - s1 = try(Biostrings::AAString(protlist[[id1]]), silent = TRUE) - s2 = try(Biostrings::AAString(protlist[[id2]]), silent = TRUE) - s12 = try(Biostrings::pairwiseAlignment(s1, s2, type = type, - substitutionMatrix = submat, scoreOnly = TRUE), - silent = TRUE) - - if (is.numeric(s12) == FALSE ) { - sim = 0L - }else { - sim = s12 - } - } - return(sim) -} - -#The following function accepts a list of 
protein sequences (the products of isoforms), and returns their local alignment matrrix. -#This function and the one it calls(appears after it in the script) are mostly copied from another package, I just removed their normalization -#to get the textbook local alignment scores and not a number between 0 and 1 as in the original code. - - -pop.size=50 #Population side for the Genetic Algorithm that modifies the isoform function assignment to optimize the fitness funtion - -number.of.nodes=200 #This is the number of groups that the isoforms will be split into in order to speed up computation - -args=commandArgs(trailingOnly = TRUE) #The argument to this script is an integer between 1 and 200, and it gives the group of isoforms that -#the machine that called the script will process - -node.number=as.integer(args[1]) #convert the argument from string to integer - -if (file.exists('convergence_log.txt')) #if the algorithm converged (determined by a master script, combine_tables.R, that runs after all 200 instances of this script finished) don't do anything -{ - if (as.integer(read.table('convergence_log.txt',header=F))==-1) - - quit('no') - -} - -if (!file.exists(paste0('interpro_state_',node.number,'.RData'))) #if this is the first iteration of the algorithm -{ - - set.seed(123) #set a randon seed - - interpro.tab=read.table(path.to.interpro,sep='\t',header=TRUE) - - interpro.tab=interpro.tab[interpro.tab[,2]!="",] - - interpro.tab=interpro.tab[!duplicated(paste(interpro.tab[,1],interpro.tab[,2])),] - - colnames(interpro.tab)[2]='domain' - - interpro2go=fread('interpro2go',sep=';',header=FALSE,data.table = FALSE,skip = 3) - - interpro2go$V1=unlist(lapply(lapply(lapply(interpro2go$V1,strsplit,split=' '),unlist),'[[',1)) - - interpro.ids=interpro2go$V1 - - interpro2go=interpro2go[,2] - - names(interpro2go)=gsub('InterPro:','',interpro.ids) - - rm(interpro.ids) - - init.coefs=c(-0.04359383,0.23389276,0.39723520) - - #init - - coefs=init.coefs - - #The following lines read 
the GTF files with all genes and isoforms, and extract their Ensembl IDs - - gtf.file=fread(path.to.gtf.file,sep='\t',quote = '',data.table = FALSE) - - gtf.file=gtf.file[gtf.file$V3=='transcript',] - - transcript.ids=gsub(';','',unlist(lapply(strsplit(as.character(gtf.file[,9]),split=' '),'[[',6))) - - transcript.ids=gsub("\"",'',transcript.ids) - - gene.ids=gsub(';','',unlist(lapply(strsplit(as.character(gtf.file[,9]),split=' '),'[[',2))) - - gene.ids=gsub("\"",'',gene.ids) - - #Using the gene ensembl IDs from above, we obtain each gene's GO terms - - hgnc.tab=fread(path.to.hgnc,sep='\t',quote = '',data.table = FALSE) - - gaf.file=fread(path.to.gaf,sep='\t',quote = '',data.table = FALSE) - - gaf.file=gaf.file[gaf.file$V9=='F',] - - gene.functions=mclapply(unique(gene.ids),function(x){ - - if (sum(hgnc.tab$ensembl_gene_id==x)==0) - - return(NULL) - - next.uniprot=hgnc.tab$uniprot_ids[hgnc.tab$ensembl_gene_id==x] - - if (nchar(next.uniprot)<=1) - - return(NULL) - - all.trs=unique(transcript.ids[gene.ids==x]) - - domains=interpro.tab$domain[interpro.tab$ensembl_transcript_id %in% all.trs] - - interpro2go.terms=unique(interpro2go[names(interpro2go) %in% domains]) - - unique(c(gaf.file$V5[gaf.file$V2 %in% next.uniprot],interpro2go.terms)) - - - },mc.cores = num.cores) - - names(gene.functions)=unique(gene.ids) - - gene.functions=gene.functions[!unlist(lapply(gene.functions,is.null)) & !(unlist(lapply(gene.functions,length))==0)] - - #remove GO terms that are common to 10% of the genes or more - - remove.go=names(table(unlist(gene.functions))[table(unlist(gene.functions))>=length(unique(gene.ids))/10]) - - gene.functions=lapply(gene.functions,function(l)l[-which(l %in% remove.go)]) #removing GO terms that are too common - - gene.functions=gene.functions[unlist(lapply(gene.functions,length))>0] - - #check that we have a protein sequence for each isoform, if not (non-coding) remove its ID - - sequence.exists=transcript.ids %in% 
gsub('translated_','',gsub('.fa','',list.files(path.to.sequences))) - - gene.ids=gene.ids[sequence.exists] - - transcript.ids=transcript.ids[sequence.exists] - - # For each isoform ID make sure that it is not duplicated, that it belongs to the group processed by this instance of the script (node.number) - # and that the protein code does not contain characters other than amino acids - - unique.iso=!duplicated(transcript.ids) & ((1:length(transcript.ids))%%number.of.nodes+1==node.number) & unlist(mclapply(transcript.ids,function(x){ - - seq=(readAAStringSet(paste0(path.to.sequences,'/translated_',x,'.fa'))) - - if (length(seq)==0) - - return(FALSE) - - if (sum(!names(table(strsplit(as.character(seq),split=''))) %in% c("A", "C" ,"D" ,"E" ,"F", "G", "H" ,"I" ,"K" ,"L" ,"M", "N" ,"P", "Q", "R","S" ,"T", "V" ,"W" ,"Y"))>0) - - return(FALSE) - - TRUE - - },mc.cores = num.cores)) - - #remove gene and isoformn IDs that do not satisfy the check above - - gene.ids=gene.ids[unique.iso] - - transcript.ids=transcript.ids[unique.iso] - - #free memory - - rm(gtf.file) - - #for each isoform, list all the GO terms that it may be assigned, i.e. 
the GO terms that belong to its gene - - isoform.functions=gene.functions[gene.ids] - - names(isoform.functions)=transcript.ids - - #remove isoforms that did not have a GO term - - isoform.functions=isoform.functions[!(unlist(lapply(isoform.functions,is.null)) | unlist(lapply(isoform.functions,length))==0)] - - #keep only IDs of isoforms that have at least one function - - transcript.ids=names(isoform.functions) - - #free memory - - rm(gene.functions) - - #Create a binary matrix of isoform IDS X GO terms, so that entry i,j is 1 if isoform i has GO term j, otherwise 0 - - iso.has.func=do.call(cbind,mclapply(unique(unlist(isoform.functions)),function(x){ - - Matrix(as.integer(unlist(lapply(isoform.functions,function(l)x %in% l))),ncol=1) - - },mc.cores = num.cores)) - - colnames(iso.has.func)=unique(unlist(isoform.functions)) - - rownames(iso.has.func)=names(isoform.functions) - - print(paste0('Num isoforms: ',length(transcript.ids))) - - #free memory - - rm(sequence.exists) - - rm(unique.iso) - - #read the protein sequences for all the isoforms - - x.seq=mclapply(paste0(path.to.sequences,'/translated_',transcript.ids,'.fa'),function(x)as.character(readFASTA(x)),mc.cores = num.cores) - - print('Read sequences') - - #calculate local alignment between all pairs of isoforms - - seq.sim.mat=calcParProtSeqSim(x.seq, cores = num.cores, type = "local", submat = "BLOSUM62") - - #sugg is the initial assignment of functions to isoforms, since this is the first iteration it is set to NULL - - sugg=do.call(c,mclapply(transcript.ids,function(iso.itr) - { - domains=interpro.tab$domain[interpro.tab$ensembl_transcript_id==iso.itr] - - interpro2go.terms=interpro2go[names(interpro2go) %in% domains] - - isoform.functions[[iso.itr]] %in% interpro2go.terms - - },mc.cores = num.cores)) - - #Use integers instead of strings for identifying GO terms - - isoform.functions=mclapply(isoform.functions,function(l)which(colnames(iso.has.func) %in% l),mc.cores = num.cores) - - - -}else{ - - #If 
this is not the first iteration, load the information from the last iteration. This will include the isoform function assignments to - #all the isoforms that were randomly chosen by combine_tables.R for the subset processed by this instance of the script - - load(paste0('interpro_state_',node.number,'.RData')) - - #Use integers instead of strings for identifying GO terms - - isoform.functions=mclapply(isoform.functions,function(l)which(colnames(iso.has.func) %in% l),mc.cores = num.cores) - - #Set the initial values of a solution, i.e. the isform GO term assignment from end of the previous iteration - - sugg=c() - - for (iso.itr in 1:length(transcript.ids)) - - sugg=c(sugg,iso.has.func[iso.itr,isoform.functions[[iso.itr]]]) #this concatenates for each isoform the values of the - #Boolean matrix 'iso.has.func' that indicate whether it has each of the GO terms - #of the gene that contains it - - #Calculate the local alignment scores between all pairs of isoforms. This is needed because in each iterations a new set of isoform is being processed - - seq.sim.mat=calcParProtSeqSim(x.seq, cores = num.cores, type = "local", submat = "BLOSUM62") - -} - -print('Starting optimization') - -total.length=length(unlist(isoform.functions)) #The sum of the number of GO terms that each isoform can have. This is the length of a solution, -#because for each isoform we have to specify which of the candidate GO terms are assigned to it - -start.funcs=unlist(lapply(isoform.functions,function(l)length(l))) #The maximal number of functions each isoform can have - -#Next we compute the pairs of isoforms whose genes share at least one GO term. 
These are stored in a logical isoforms X isoforms matrix - -compare.pairs=do.call(cbind,mclapply(unique(unlist(isoform.functions)),function(x){ - - matrix(as.integer(unlist(lapply(isoform.functions,function(l)x %in% l))),ncol=1) - -},mc.cores = num.cores)) - -compare.pairs=(compare.pairs%*%t(compare.pairs))>0 - -#Run a genetic algorithm that will optimize the assignment of functions to isoforms such that the model fit, returned by the functions gs.fitness, -#is optimal. - -t.data=log_x(seq.sim.mat[lower.tri(seq.sim.mat) & compare.pairs])$x.t - -seq.sim.mat[lower.tri(seq.sim.mat) & compare.pairs]=t.data - -res.ga=ga(type = 'binary',fitness = ga.fitness,nBits = total.length,maxiter = 200,popSize = pop.size,suggestions = sugg,parallel=num.cores) - -#Next we extract the solution into a new matrix 'iso.has'func', so that the GO term assignment that it chose will be passed to the next iteration - -col.names.ihf=colnames(iso.has.func) - -row.names.ihf=rownames(iso.has.func) - -iso.has.func=Matrix(0,nrow=nrow(iso.has.func),ncol=ncol(iso.has.func)) #Start with a matrix of zeroes - -if (sum(res.ga@solution[1,])>0) - - #Wherever the value of the solution is 1 (a function is assigned to an isoform) , set the value of the matrix 'iso.has.func' - #to 1 in the row that corresponds to the isoform and the column that corresponds to the GO function - - iso.has.func[cbind(rep(1:length(transcript.ids),start.funcs)[res.ga@solution[1,]==1],unlist(isoform.functions)[res.ga@solution[1,]==1])]=1 - -colnames(iso.has.func)=col.names.ihf - -rownames(iso.has.func)=row.names.ihf - -rm(comp.vals) - -#Transform the GO terms from integers back to strings, for the master script's use - -isoform.functions=mclapply(isoform.functions,function(l)colnames(iso.has.func)[l],mc.cores = num.cores) - -save.image(paste0('interpro_state_',node.number,'.RData')) #save the state for the master script's use diff --git a/em_scripts/translate_all_isoforms.R b/em_scripts/translate_all_isoforms.R deleted file mode 
100644 index ba16d2c4..00000000 --- a/em_scripts/translate_all_isoforms.R +++ /dev/null @@ -1,38 +0,0 @@ -library(seqinr) - -library(Biostrings) - -library(data.table) - - -#Change the following paths as needed: - -path.to.gtf='/Users/karleg/STAR/STAR/bin/MacOSX_x86_64/data/GRCh38/annotation/Homo_sapiens.GRCh38.91.gtf' - -fasta.file='/Users/karleg/STAR/STAR/bin/MacOSX_x86_64/data/GRCh38/sequence/GRCh38_r91.all.fa' - -path.to.gffread='gffread' - -#In order to use the gffread tool for translating isoforms we need to read the GTF file: - -gtf.file=fread(path.to.gtf,sep='\t',quote = '',data.table = FALSE) - -gtf.file=gtf.file[gtf.file$V3=='exon',] - -transcript.ids=gsub(';','',unlist(lapply(strsplit(as.character(gtf.file[,9]),split=' '),'[[',6))) - -transcript.ids=gsub("\"",'',transcript.ids) - -#The following path to the genomic sequence is passed to the gffread tool in order for it to generate the protein sequence: - -input=mclapply(unique(transcript.ids),function(isoform.id) -{ - tr.gtf=gtf.file[transcript.ids == isoform.id,] - - write.table(tr.gtf,paste0("transcript",isoform.id,".gtf"),sep='\t',col.names = FALSE,row.names = FALSE,quote = FALSE) - - system(paste0(path.to.gffread,' -y isoform_seqs/translated_',isoform.id,'.fa -g ',fasta.file," transcript",isoform.id,".gtf")) - - system(paste0("rm transcript",isoform.id,".gtf")) - -},mc.cores = 32) \ No newline at end of file diff --git a/isopret-cli/pom.xml b/isopret-cli/pom.xml index ceeae9f8..889efc67 100644 --- a/isopret-cli/pom.xml +++ b/isopret-cli/pom.xml @@ -6,7 +6,7 @@ org.jax.isopret isopret - 1.1.16 + 1.1.17 isopret-cli @@ -19,6 +19,11 @@ isopret-core ${project.parent.version} + + org.jax.isopret + isopret-io + ${project.parent.version} + org.freemarker freemarker diff --git a/isopret-cli/src/main/java/module-info.java b/isopret-cli/src/main/java/module-info.java index 839cbe09..8aa922e3 100644 --- a/isopret-cli/src/main/java/module-info.java +++ b/isopret-cli/src/main/java/module-info.java @@ -3,6 +3,7 @@ 
requires org.monarchinitiative.phenol.core; requires org.slf4j; requires org.jax.isopret.core; + requires org.jax.isopret.io; requires org.monarchinitiative.phenol.analysis; opens org.jax.isopret.cli.command to info.picocli; diff --git a/isopret-cli/src/main/java/org/jax/isopret/cli/command/DownloadCommand.java b/isopret-cli/src/main/java/org/jax/isopret/cli/command/DownloadCommand.java index 76cb67fb..85e7e933 100644 --- a/isopret-cli/src/main/java/org/jax/isopret/cli/command/DownloadCommand.java +++ b/isopret-cli/src/main/java/org/jax/isopret/cli/command/DownloadCommand.java @@ -1,6 +1,6 @@ package org.jax.isopret.cli.command; -import org.jax.isopret.core.IsopretDownloader; +import org.jax.isopret.io.IsopretDownloader; import picocli.CommandLine; import java.util.concurrent.Callable; diff --git a/isopret-cli/src/main/java/org/jax/isopret/cli/command/GoOverrepCommand.java b/isopret-cli/src/main/java/org/jax/isopret/cli/command/GoOverrepCommand.java index b9726eaf..b15493c5 100644 --- a/isopret-cli/src/main/java/org/jax/isopret/cli/command/GoOverrepCommand.java +++ b/isopret-cli/src/main/java/org/jax/isopret/cli/command/GoOverrepCommand.java @@ -63,6 +63,11 @@ static class ExclusiveInputFile { " None)") private String mtc = "Bonferroni"; + @CommandLine.Option(names = {"-o", "--outdir"}, + description = "output directory") + private File outdir = null; + + @CommandLine.Option(names = {"--export-all"}, description = "Export results for all GO terms (i.e., do not threshold by p-value)") boolean exportAll = false; @@ -162,13 +167,18 @@ private void writeGoResultsToFile(GoAnalysisResults results, List dasGoTerms = results.dasGoTerms(); List dgeGoTerms = results.dgeGoTerms(); - - LOGGER.info("Writing GO Overrepresentation analysis results to {}", outfile); + File output_path; + if (outdir == null) { + output_path = new File(outfile); + } else { + output_path = new File(outdir + File.separator + outfile); + } + LOGGER.info("Writing GO Overrepresentation analysis results 
to {}", output_path.getAbsolutePath()); int totalDasGoTerms = dasGoTerms.size(); int outputDasGoTerms = 0; int totalDgeGoTerms = dgeGoTerms.size(); int outputDgeGoTerms = 0; - try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile))) { + try (BufferedWriter bw = new BufferedWriter(new FileWriter(output_path))) { for (var cts : dasGoTerms) { try { if (exportAll) { @@ -203,7 +213,8 @@ private void writeGoResultsToFile(GoAnalysisResults results, } } } catch (IOException e) { - e.printStackTrace(); + LOGGER.error("Could not write GO overrep results to file: {}", e.getMessage()); + return; } System.out.printf("We output %d/%d DGE GO terms and %d/%d DAS GO terms", outputDgeGoTerms, totalDgeGoTerms, outputDasGoTerms, totalDasGoTerms); diff --git a/isopret-cli/src/main/java/org/jax/isopret/cli/command/InterproOverrepCommand.java b/isopret-cli/src/main/java/org/jax/isopret/cli/command/InterproOverrepCommand.java index 430c9441..9dd859ce 100644 --- a/isopret-cli/src/main/java/org/jax/isopret/cli/command/InterproOverrepCommand.java +++ b/isopret-cli/src/main/java/org/jax/isopret/cli/command/InterproOverrepCommand.java @@ -56,7 +56,7 @@ public Integer call() { try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile))) { bw.write(visualizer.getTsv()); } catch (IOException e) { - e.printStackTrace(); + LOGGER.error("Could not write interpro overrep file: {}", e.getMessage()); } return null; } diff --git a/isopret-core/pom.xml b/isopret-core/pom.xml index ed2dd643..38761683 100644 --- a/isopret-core/pom.xml +++ b/isopret-core/pom.xml @@ -6,7 +6,7 @@ org.jax.isopret isopret - 1.1.16 + 1.1.17 isopret-core diff --git a/isopret-core/src/main/java/module-info.java b/isopret-core/src/main/java/module-info.java index 7c6fa8d2..7ebce401 100644 --- a/isopret-core/src/main/java/module-info.java +++ b/isopret-core/src/main/java/module-info.java @@ -1,7 +1,7 @@ module org.jax.isopret.core { requires org.slf4j; requires org.monarchinitiative.phenol.core; - requires 
org.monarchitiative.svart; + requires org.monarchinitiative.svart; requires org.monarchinitiative.phenol.analysis; requires jannovar.core; requires org.apache.commons.net; diff --git a/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretBuilder.java b/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretBuilder.java deleted file mode 100644 index b8530167..00000000 --- a/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretBuilder.java +++ /dev/null @@ -1,23 +0,0 @@ -package org.jax.isopret.core.configuration; - -import org.jax.isopret.except.IsopretRuntimeException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.nio.file.Path; -import java.util.Objects; - -public class IsopretBuilder { - private final static Logger LOGGER = LoggerFactory.getLogger(IsopretBuilder.class); - - private final Path dataDirectory; - private final IsopretDataResolver dataResolver; - public static IsopretBuilder builder(Path liricalDataDirectory) throws IsopretRuntimeException { - return new IsopretBuilder(liricalDataDirectory); - } - - private IsopretBuilder(Path dataDirectory) throws IsopretRuntimeException { - this.dataDirectory = Objects.requireNonNull(dataDirectory); - this.dataResolver = IsopretDataResolver.of(dataDirectory); - } -} diff --git a/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretDataResolver.java b/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretDataResolver.java index 80c970c3..8003b9f3 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretDataResolver.java +++ b/isopret-core/src/main/java/org/jax/isopret/core/configuration/IsopretDataResolver.java @@ -1,6 +1,6 @@ package org.jax.isopret.core.configuration; -import org.jax.isopret.core.DownloadItem; +//import org.jax.isopret.core.DownloadItem; import org.jax.isopret.except.IsopretRuntimeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,21 +34,15 @@ 
public class IsopretDataResolver { private static final String ISOFORM_FUNCTION_BP_FILENAME = "isoform_function_list_bp.txt"; private static final String ISOFORM_FUNCTION_CC_URL = ZENODO_BASE_URL + "files/isoform_function_list_cc.txt?download=1"; private static final String ISOFORM_FUNCTION_CC_FILENAME = "isoform_function_list_cc.txt"; + /* private static final DownloadItem go = makeItem(GO_JSON_URL, GO_JSON); private static final DownloadItem jannovarHg38 = makeItem(JANNOVAR_ZENODO_URL, JANNOVAR_FILENAME); private static final DownloadItem hgnc = makeItem(HGNC_URL,HGNC_FILENAME); - private static final DownloadItem interproDomainDesc = makeItem(INTERPRO_DOMAIN_DESC_URL, INTERPRO_DOMAIN_DESC_FILENAME); - private static final DownloadItem interproDomains = makeItem(INTERPRO_DOMAINS_URL, INTERPRO_DOMAINS_FILENAME); - private static final DownloadItem isoformFunctionMf = makeItem(ISOFORM_FUNCTION_MF_URL, ISOFORM_FUNCTION_MF_FILENAME); - private static final DownloadItem isoformFunctionBp = makeItem(ISOFORM_FUNCTION_BP_URL, ISOFORM_FUNCTION_BP_FILENAME); - - private static final DownloadItem isoformFunctionCc = makeItem(ISOFORM_FUNCTION_CC_URL, ISOFORM_FUNCTION_CC_FILENAME); - static DownloadItem makeItem(String urlString, String base) { try { URL url = new URL(urlString); @@ -61,7 +55,7 @@ static DownloadItem makeItem(String urlString, String base) { private static final Set allDownloadItems = Set.of(go, jannovarHg38, hgnc, interproDomainDesc, interproDomains, isoformFunctionMf, isoformFunctionBp, isoformFunctionCc); - + */ @@ -116,9 +110,9 @@ public Path interproDomainsPath() { } - /** @return Download data for Gene Ontology (go.json)> */ + /** @return Download data for Gene Ontology (go.json)> public static DownloadItem go() { return go; } - /** @return Download data for Gene Ontology (go.json)> */ + /** @return Download data for Gene Ontology (go.json)> public static DownloadItem jannovarHg38() { return jannovarHg38; } public static DownloadItem hgnc() { return 
hgnc; } public static DownloadItem interproDomainDesc() { return interproDomainDesc; } @@ -127,7 +121,7 @@ public Path interproDomainsPath() { public static DownloadItem isoformFunctionBp() { return isoformFunctionBp; } public static DownloadItem isoformFunctionCc() { return isoformFunctionCc; } public static Set allDownloadItems() { return allDownloadItems; } - +*/ } diff --git a/isopret-gui/pom.xml b/isopret-gui/pom.xml index 6b79c3ac..da38c458 100644 --- a/isopret-gui/pom.xml +++ b/isopret-gui/pom.xml @@ -9,7 +9,7 @@ org.jax.isopret isopret - 1.1.16 + 1.1.17 isopret-gui @@ -20,6 +20,11 @@ isopret-core ${project.parent.version} + + org.jax.isopret + isopret-io + ${project.parent.version} + net.rgielen javafx-weaver-spring-boot-starter diff --git a/isopret-gui/src/main/java/module-info.TMP b/isopret-gui/src/main/java/module-info.TMP new file mode 100644 index 00000000..b0bf2038 --- /dev/null +++ b/isopret-gui/src/main/java/module-info.TMP @@ -0,0 +1,22 @@ +module org.jax.isopret.gui { + requires org.slf4j; + requires org.monarchinitiative.phenol.core; + requires org.monarchinitiative.svart; + requires org.monarchinitiative.phenol.analysis; + requires jannovar.core; + requires org.apache.commons.net; + requires org.monarchinitiative.phenol.io; + requires guava; + + + requires org.jax.isopret.core.analysis; + requires org.jax.isopret.core.impl.go; + requires org.jax.isopret.model; + requires org.jax.isopret.except; + requires org.jax.isopret.core.impl.rnaseqdata; + requires org.jax.isopret.visualization; + requires org.jax.isopret.core; + requires org.jax.isopret.core.impl.interpro; + requires org.jax.isopret.io; + +} \ No newline at end of file diff --git a/isopret-gui/src/main/java/org/jax/isopret/gui/controller/AnalysisController.java b/isopret-gui/src/main/java/org/jax/isopret/gui/controller/AnalysisController.java index 94e8ae52..faa1f62b 100644 --- a/isopret-gui/src/main/java/org/jax/isopret/gui/controller/AnalysisController.java +++ 
b/isopret-gui/src/main/java/org/jax/isopret/gui/controller/AnalysisController.java @@ -319,7 +319,7 @@ private void openHbaDealsResultInTab(Visualizable hbadealsResult) { tabPane.getSelectionModel().select(tab); this.openTabs.put(hbadealsResult, tab); } catch (IOException e) { - e.printStackTrace(); + LOGGER.error("Could not load hbaGenePane.fxml file: {}", e.getMessage()); } diff --git a/isopret-gui/src/main/java/org/jax/isopret/gui/controller/MainController.java b/isopret-gui/src/main/java/org/jax/isopret/gui/controller/MainController.java index 68778fcc..d881a69f 100644 --- a/isopret-gui/src/main/java/org/jax/isopret/gui/controller/MainController.java +++ b/isopret-gui/src/main/java/org/jax/isopret/gui/controller/MainController.java @@ -12,7 +12,7 @@ import javafx.scene.layout.BorderPane; import javafx.stage.DirectoryChooser; import javafx.stage.FileChooser; -import org.jax.isopret.core.InputFileChecker; +import org.jax.isopret.io.InputFileChecker; import org.jax.isopret.core.InterproAnalysisResults; import org.jax.isopret.core.IsopretInterpoAnalysisRunner; import org.jax.isopret.core.analysis.InterproFisherExact; @@ -458,7 +458,7 @@ public void manualDownload(ActionEvent actionEvent) { } InputFileChecker checker = new InputFileChecker(dir.getAbsolutePath()); - DownloadPopup pop = new DownloadPopup(checker.getSuccessulDownloads(), + DownloadPopup pop = new DownloadPopup(checker.getSuccessfulDownloads(), checker.getFailedDownloads(), dir.getAbsolutePath(), this.hostServicesWrapper); diff --git a/isopret-gui/src/main/java/org/jax/isopret/gui/service/IsopretFxDownloadTask.java b/isopret-gui/src/main/java/org/jax/isopret/gui/service/IsopretFxDownloadTask.java index f9600c05..da90f40c 100644 --- a/isopret-gui/src/main/java/org/jax/isopret/gui/service/IsopretFxDownloadTask.java +++ b/isopret-gui/src/main/java/org/jax/isopret/gui/service/IsopretFxDownloadTask.java @@ -1,7 +1,7 @@ package org.jax.isopret.gui.service; import javafx.concurrent.Task; -import 
org.jax.isopret.core.IsopretDownloader; +import org.jax.isopret.io.IsopretDownloader; /** * A version of {@link IsopretDownloader} intended to be used as a {@link Task} in diff --git a/isopret-gui/src/main/java/org/jax/isopret/gui/service/impl/IsopretServiceImpl.java b/isopret-gui/src/main/java/org/jax/isopret/gui/service/impl/IsopretServiceImpl.java index 43838625..298f05c6 100644 --- a/isopret-gui/src/main/java/org/jax/isopret/gui/service/impl/IsopretServiceImpl.java +++ b/isopret-gui/src/main/java/org/jax/isopret/gui/service/impl/IsopretServiceImpl.java @@ -121,7 +121,7 @@ public void saveSettings() { try { pgProperties.store(new FileWriter(isopretSettingsFile), "store to properties file"); } catch (IOException e) { - e.printStackTrace(); + LOGGER.error("Could not write settings: {}", e.getMessage()); } } diff --git a/isopret-io/pom.xml b/isopret-io/pom.xml new file mode 100644 index 00000000..fc389981 --- /dev/null +++ b/isopret-io/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + org.jax.isopret + isopret + 1.1.17 + + jar + isopret-io + + + 21 + 21 + UTF-8 + + + + + commons-net + commons-net + + + + \ No newline at end of file diff --git a/isopret-io/src/main/java/module-info.java b/isopret-io/src/main/java/module-info.java new file mode 100644 index 00000000..1d75e002 --- /dev/null +++ b/isopret-io/src/main/java/module-info.java @@ -0,0 +1,6 @@ +module org.jax.isopret.io { + requires org.slf4j; + requires org.apache.commons.net; + + exports org.jax.isopret.io; +} \ No newline at end of file diff --git a/isopret-core/src/main/java/org/jax/isopret/core/InputFileChecker.java b/isopret-io/src/main/java/org/jax/isopret/io/InputFileChecker.java similarity index 74% rename from isopret-core/src/main/java/org/jax/isopret/core/InputFileChecker.java rename to isopret-io/src/main/java/org/jax/isopret/io/InputFileChecker.java index 50a9a735..b81bdf46 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/InputFileChecker.java +++ 
b/isopret-io/src/main/java/org/jax/isopret/io/InputFileChecker.java @@ -1,12 +1,13 @@ -package org.jax.isopret.core; +package org.jax.isopret.io; -import org.jax.isopret.core.configuration.IsopretDataResolver; import java.nio.file.Path; import java.util.HashMap; import java.util.Map; import java.util.Set; +import org.jax.isopret.io.impl.DownloadItem; +import org.jax.isopret.io.impl.IsopretDataResolver; /** * If there is an issue with the downloads, this class will check each expected file and will @@ -17,12 +18,12 @@ public class InputFileChecker { - private final Map successulDownloads; + private final Map successfulDownloads; private final Map failedDownloads; public InputFileChecker(String dataDownload) { - successulDownloads = new HashMap<>(); + successfulDownloads = new HashMap<>(); failedDownloads = new HashMap<>(); Set items = IsopretDataResolver.allDownloadItems(); for (var ditem: items) { @@ -30,15 +31,15 @@ public InputFileChecker(String dataDownload) { String url = ditem.url().toString(); Path path = Path.of(dataDownload).resolve(basename); if (path.toFile().isFile()) { - successulDownloads.put(basename, url); + successfulDownloads.put(basename, url); } else { failedDownloads.put(basename, url); } } } - public Map getSuccessulDownloads() { - return successulDownloads; + public Map getSuccessfulDownloads() { + return successfulDownloads; } public Map getFailedDownloads() { diff --git a/isopret-core/src/main/java/org/jax/isopret/core/IsopretDownloader.java b/isopret-io/src/main/java/org/jax/isopret/io/IsopretDownloader.java similarity index 95% rename from isopret-core/src/main/java/org/jax/isopret/core/IsopretDownloader.java rename to isopret-io/src/main/java/org/jax/isopret/io/IsopretDownloader.java index 22c63e86..57ba2c3e 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/IsopretDownloader.java +++ b/isopret-io/src/main/java/org/jax/isopret/io/IsopretDownloader.java @@ -1,10 +1,7 @@ -package org.jax.isopret.core; +package org.jax.isopret.io; 
+ -import org.jax.isopret.core.configuration.IsopretDataResolver; -import org.jax.isopret.except.IsopretRuntimeException; -import org.jax.isopret.core.impl.download.FileDownloadException; -import org.jax.isopret.core.impl.download.FileDownloader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,6 +14,11 @@ import java.nio.file.Paths; import java.util.zip.GZIPInputStream; +import org.jax.isopret.io.impl.DownloadItem; +import org.jax.isopret.io.impl.IsopretDataResolver; +import org.jax.isopret.io.impl.IsopretRuntimeException; +import org.jax.isopret.io.download.*; + /** * Command to download the {@code hp.obo} and {@code phenotype.hpoa} files that * we will need to run Isopret. Note that this class is also used by diff --git a/isopret-core/src/main/java/org/jax/isopret/core/impl/download/FileDownloadException.java b/isopret-io/src/main/java/org/jax/isopret/io/download/FileDownloadException.java similarity index 86% rename from isopret-core/src/main/java/org/jax/isopret/core/impl/download/FileDownloadException.java rename to isopret-io/src/main/java/org/jax/isopret/io/download/FileDownloadException.java index 6ca8a94c..661ae7ed 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/impl/download/FileDownloadException.java +++ b/isopret-io/src/main/java/org/jax/isopret/io/download/FileDownloadException.java @@ -1,4 +1,4 @@ -package org.jax.isopret.core.impl.download; +package org.jax.isopret.io.download; public class FileDownloadException extends Exception { diff --git a/isopret-core/src/main/java/org/jax/isopret/core/impl/download/FileDownloader.java b/isopret-io/src/main/java/org/jax/isopret/io/download/FileDownloader.java similarity index 99% rename from isopret-core/src/main/java/org/jax/isopret/core/impl/download/FileDownloader.java rename to isopret-io/src/main/java/org/jax/isopret/io/download/FileDownloader.java index 405883ca..2f0cacf8 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/impl/download/FileDownloader.java +++ 
b/isopret-io/src/main/java/org/jax/isopret/io/download/FileDownloader.java @@ -1,4 +1,4 @@ -package org.jax.isopret.core.impl.download; +package org.jax.isopret.io.download; import org.apache.commons.net.ftp.FTP; import org.apache.commons.net.ftp.FTPClient; diff --git a/isopret-core/src/main/java/org/jax/isopret/core/impl/download/ProgressBar.java b/isopret-io/src/main/java/org/jax/isopret/io/download/ProgressBar.java similarity index 97% rename from isopret-core/src/main/java/org/jax/isopret/core/impl/download/ProgressBar.java rename to isopret-io/src/main/java/org/jax/isopret/io/download/ProgressBar.java index f3590b21..3ac1a93f 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/impl/download/ProgressBar.java +++ b/isopret-io/src/main/java/org/jax/isopret/io/download/ProgressBar.java @@ -1,4 +1,4 @@ -package org.jax.isopret.core.impl.download; +package org.jax.isopret.io.download; /** diff --git a/isopret-core/src/main/java/org/jax/isopret/core/DownloadItem.java b/isopret-io/src/main/java/org/jax/isopret/io/impl/DownloadItem.java similarity index 89% rename from isopret-core/src/main/java/org/jax/isopret/core/DownloadItem.java rename to isopret-io/src/main/java/org/jax/isopret/io/impl/DownloadItem.java index 2ecea16b..43736545 100644 --- a/isopret-core/src/main/java/org/jax/isopret/core/DownloadItem.java +++ b/isopret-io/src/main/java/org/jax/isopret/io/impl/DownloadItem.java @@ -1,8 +1,8 @@ -package org.jax.isopret.core; +package org.jax.isopret.io.impl; import java.net.URL; -public class DownloadItem{ +public class DownloadItem { private final URL url; private final String basename; diff --git a/isopret-io/src/main/java/org/jax/isopret/io/impl/IsopretDataResolver.java b/isopret-io/src/main/java/org/jax/isopret/io/impl/IsopretDataResolver.java new file mode 100644 index 00000000..ceb7ed93 --- /dev/null +++ b/isopret-io/src/main/java/org/jax/isopret/io/impl/IsopretDataResolver.java @@ -0,0 +1,132 @@ +package org.jax.isopret.io.impl; + + +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +public class IsopretDataResolver { + + private static final Logger LOGGER = LoggerFactory.getLogger(IsopretDataResolver.class); + private final static String GO_JSON = "go.json"; + private final static String GO_JSON_URL = "http://purl.obolibrary.org/obo/go.json"; + private static final String JANNOVAR_ZENODO_URL = "https://zenodo.org/record/4311513/files/hg38_ensembl.ser?download=1"; + private static final String JANNOVAR_FILENAME = "hg38_ensembl.ser"; + private static final String HGNC_URL = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt"; + private static final String HGNC_FILENAME = "hgnc_complete_set.txt"; + /** The Base URL of the zenodo repository where we store various input files for Isopret. */ + private static final String ZENODO_BASE_URL = "https://zenodo.org/record/6466530/"; + private static final String INTERPRO_DOMAIN_DESC_URL = ZENODO_BASE_URL + "files/interpro_domain_desc.txt?download=1"; + private static final String INTERPRO_DOMAIN_DESC_FILENAME = "interpro_domain_desc.txt"; + private static final String INTERPRO_DOMAINS_URL = ZENODO_BASE_URL + "files/interpro_domains.txt?download=1"; + private static final String INTERPRO_DOMAINS_FILENAME = "interpro_domains.txt"; + private static final String ISOFORM_FUNCTION_MF_URL = ZENODO_BASE_URL + "files/isoform_function_list_mf.txt?download=1"; + private static final String ISOFORM_FUNCTION_MF_FILENAME = "isoform_function_list_mf.txt"; + private static final String ISOFORM_FUNCTION_BP_URL = ZENODO_BASE_URL + "files/isoform_function_list_bp.txt?download=1"; + private static final String ISOFORM_FUNCTION_BP_FILENAME = "isoform_function_list_bp.txt"; + private static final String ISOFORM_FUNCTION_CC_URL = ZENODO_BASE_URL + 
"files/isoform_function_list_cc.txt?download=1"; + private static final String ISOFORM_FUNCTION_CC_FILENAME = "isoform_function_list_cc.txt"; + private static final DownloadItem go = makeItem(GO_JSON_URL, GO_JSON); + private static final DownloadItem jannovarHg38 = makeItem(JANNOVAR_ZENODO_URL, JANNOVAR_FILENAME); + private static final DownloadItem hgnc = makeItem(HGNC_URL,HGNC_FILENAME); + + private static final DownloadItem interproDomainDesc = makeItem(INTERPRO_DOMAIN_DESC_URL, INTERPRO_DOMAIN_DESC_FILENAME); + + private static final DownloadItem interproDomains = makeItem(INTERPRO_DOMAINS_URL, INTERPRO_DOMAINS_FILENAME); + + private static final DownloadItem isoformFunctionMf = makeItem(ISOFORM_FUNCTION_MF_URL, ISOFORM_FUNCTION_MF_FILENAME); + + private static final DownloadItem isoformFunctionBp = makeItem(ISOFORM_FUNCTION_BP_URL, ISOFORM_FUNCTION_BP_FILENAME); + + + private static final DownloadItem isoformFunctionCc = makeItem(ISOFORM_FUNCTION_CC_URL, ISOFORM_FUNCTION_CC_FILENAME); + + static DownloadItem makeItem(String urlString, String base) { + try { + URL url = new URL(urlString); + return new DownloadItem(url, base); + } catch (MalformedURLException e) { + // should never happen + throw new RuntimeException("Could not create URL from " + urlString); + } + } + private static final Set allDownloadItems = Set.of(go, + jannovarHg38, hgnc, interproDomainDesc, interproDomains, + isoformFunctionMf, isoformFunctionBp, isoformFunctionCc); + + + + + private final Path dataDirectory; + + public static IsopretDataResolver of(Path dataDirectory) throws RuntimeException { + return new IsopretDataResolver(dataDirectory); + } + + public IsopretDataResolver(Path dataDirectory) throws RuntimeException { + this.dataDirectory = Objects.requireNonNull(dataDirectory, "Data directory must not be null!"); + LOGGER.debug("Using isopret directory at `{}`.", dataDirectory.toAbsolutePath()); + checkResources(); + } + + private void checkResources() throws RuntimeException { + 
boolean error = false; + List requiredFiles = List.of(goJson(), isoformFunctionListBp(), isoformFunctionListCc(), isoformFunctionListMf(), + hg38Ensembl(), hgncCompleteSet(), interproDomainDescPath(), interproDomainsPath()); + for (Path file : requiredFiles) { + if (!Files.isRegularFile(file)) { + LOGGER.error("Missing required file `{}` in `{}`.", file.toFile().getName(), dataDirectory.toAbsolutePath()); + error = true; + } + } + if (error) { + throw new IsopretRuntimeException("Missing one or more resource files in isopret data directory!"); + } + } + + public Path goJson() { + return dataDirectory.resolve(GO_JSON); + } + public Path isoformFunctionListBp() { + return dataDirectory.resolve(ISOFORM_FUNCTION_BP_FILENAME); + } + public Path isoformFunctionListCc() { + return dataDirectory.resolve(ISOFORM_FUNCTION_CC_FILENAME); + } + public Path isoformFunctionListMf() { + return dataDirectory.resolve(ISOFORM_FUNCTION_MF_FILENAME); + } + public Path hg38Ensembl() { return dataDirectory.resolve(JANNOVAR_FILENAME); } + public Path hgncCompleteSet() { + return dataDirectory.resolve(HGNC_FILENAME); + } + public Path interproDomainDescPath() { + return dataDirectory.resolve(INTERPRO_DOMAIN_DESC_FILENAME); + } + public Path interproDomainsPath() { + return dataDirectory.resolve(INTERPRO_DOMAINS_FILENAME); + } + + + /** @return Download data for Gene Ontology (go.json)> */ + public static DownloadItem go() { return go; } + /** @return Download data for Gene Ontology (go.json)> */ + public static DownloadItem jannovarHg38() { return jannovarHg38; } + public static DownloadItem hgnc() { return hgnc; } + public static DownloadItem interproDomainDesc() { return interproDomainDesc; } + public static DownloadItem interproDomains() { return interproDomains; } + public static DownloadItem isoformFunctionMf() { return isoformFunctionMf; } + public static DownloadItem isoformFunctionBp() { return isoformFunctionBp; } + public static DownloadItem isoformFunctionCc() { return 
isoformFunctionCc; } + public static Set allDownloadItems() { return allDownloadItems; } + + + +} diff --git a/isopret-io/src/main/java/org/jax/isopret/io/impl/IsopretRuntimeException.java b/isopret-io/src/main/java/org/jax/isopret/io/impl/IsopretRuntimeException.java new file mode 100644 index 00000000..89582a37 --- /dev/null +++ b/isopret-io/src/main/java/org/jax/isopret/io/impl/IsopretRuntimeException.java @@ -0,0 +1,6 @@ +package org.jax.isopret.io.impl; +public class IsopretRuntimeException extends RuntimeException { + + public IsopretRuntimeException() { super();} + public IsopretRuntimeException(String m) { super(m);} +} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..ecaaaba7 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,77 @@ +site_name: pyphetools + +theme: + name: "material" + features: + #- navigation.tabs + - navigation.sections + - toc.integrate + - navigation.top + - search.suggest + - search.highlight + - content.tabs.link + - content.code.annotation + - content.code.copy + - content.code.select + language: en + palette: + - scheme: default + toggle: + icon: material/toggle-switch-off-outline + name: Switch to dark mode + primary: indigo + accent: white + - scheme: slate + toggle: + icon: material/toggle-switch + name: Switch to light mode + primary: teal + accent: lime + + +#nav: +# - Home: 'index.md' + + +plugins: + - social + - search + - mkdocstrings: + handlers: + python: + options: + docstring_style: sphinx + + +extra: + social: + - icon: fontawesome/brands/github-alt + link: https://github.com/pnrobinson + - icon: fontawesome/brands/twitter + link: https://twitter.com/pnrobins + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/peter-n-robinson-b7833811/ + + +markdown_extensions: + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - admonition + - pymdownx.arithmatex: + generic: true + - footnotes + - pymdownx.details + - pymdownx.superfences + - 
pymdownx.mark
+  - attr_list
+  - md_in_html
+  - tables
+  - pymdownx.emoji:
+      emoji_index: !!python/name:materialx.emoji.twemoji
+      emoji_generator: !!python/name:materialx.emoji.to_svg
+
+copyright: |
+  © 2023 Peter N Robinson
+
diff --git a/pom.xml b/pom.xml
index b9bed921..fcb80acc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,20 +3,19 @@
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - - org.springframework.boot spring-boot-starter-parent - 3.0.3 + 3.2.0 org.jax.isopret isopret - 1.1.16 + 1.1.17 isopret-core isopret-cli isopret-gui + isopret-io pom
@@ -30,14 +29,14 @@
 ${java.version} ${java.version} ${java.version} - 2.0.0-RC5 + 2.0.4 4.3.8.RELEASE - 2.13.2 - 2.8.0 + 2.16.0 + 2.8.0 5.9.0 1.9.0 0.38 - 2.0.0-RC1 + 2.0.0-RC5
@@ -58,12 +57,12 @@
 commons-net commons-net - 3.8.0 + 3.9.0 commons-io commons-io - ${commons.io} + ${commons.io.version}
@@ -91,7 +90,7 @@
 org.yaml snakeyaml - 2.0 + 2.2
@@ -186,6 +185,29 @@
 false + + org.apache.maven.plugins + maven-enforcer-plugin + 3.3.0 + + + enforce-versions + + enforce + + + + + 3.0.5 + + + 17 + + + + + +
diff --git a/scripts/do_wilcoxon.py b/scripts/do_wilcoxon.py
new file mode 100644
index 00000000..7b68bd64
--- /dev/null
+++ b/scripts/do_wilcoxon.py
@@ -0,0 +1,171 @@
+"""Compare DAS and DGE Gene Ontology enrichment across isopretGO output files.
+
+For every GO term in the .tsv files of the input directory, the script collects
+-log10(adjusted p-value) of the DAS and DGE rows of each file, runs a paired
+Wilcoxon signed-rank test and prints the terms that pass Bonferroni correction.
+"""
+from collections import defaultdict
+import os
+import re
+import sys
+import argparse
+import math
+import numpy as np
+from scipy import stats
+
+# Matches isopretGO study strings such as 7/29(24.1%)
+FRAC_PERC_PATTERN = re.compile(r"(\d+)\/(\d+)\(([\d.]+)%\)")
+
+
+def disencombobulate(frac_perc_string):
+    """Parse an isopretGO output string.
+
+    Args:
+        frac_perc_string (str): a string such as 7/29(24.1%)
+
+    Returns:
+        tuple: the elements of the string as int,int,float, e.g., 7, 29, 24.1,
+            or (None, None, None) if the string does not start with this format
+    """
+    m = FRAC_PERC_PATTERN.match(frac_perc_string)
+    if m:
+        return int(m.group(1)), int(m.group(2)), float(m.group(3))
+    return None, None, None
+
+
+def neg_log10_pvalue(adj_pval):
+    """Return -log10 of an adjusted p-value given as text; None counts as 1.0."""
+    if adj_pval is None:
+        adj_pval = "1.0"
+    # math.log10(0) raises ValueError, so clamp a reported 0 to the smallest positive float
+    return -1 * math.log10(max(float(adj_pval), sys.float_info.min))
+
+
+class DgeDasRank:
+    """DAS and DGE enrichment values of one GO term within one isopretGO file."""
+
+    def __init__(self, category, go_label, go_id, study, population, pval, adj_pval):
+        """Create the record from the first line seen for a GO term.
+
+        Args:
+            category (str): 'DAS' or 'DGE'
+            go_label (str): label of a GO term
+            go_id (str): ID of a GO term
+            study (str): study column, e.g. 7/29(24.1%)
+            population (str): population column (unused)
+            pval (str): p-value column (unused)
+            adj_pval (str): adjusted p-value column
+        """
+        self._go_id = go_id
+        self._go_label = go_label
+        # Initialise both halves so the getters work whichever category comes first
+        self._study_das_n = None
+        self._study_das_m = None
+        self._study_das_perc = None
+        self._das_pval = None
+        self._study_dge_n = None
+        self._study_dge_m = None
+        self._study_dge_perc = None
+        self._dge_pval = None
+        self.add_line(category, go_label, go_id, study, population, pval, adj_pval)
+
+    def add_line(self, category, go_label, go_id, study, population, pval, adj_pval):
+        """Store the values of a DAS or DGE line; other categories are ignored.
+
+        Raises:
+            ValueError: if values for the same category were already stored
+        """
+        if category == 'DAS':
+            if self._study_das_n is not None:
+                raise ValueError(f"Attempt to add DAS data twice for {go_label}")
+            self._study_das_n, self._study_das_m, self._study_das_perc = disencombobulate(study)
+            self._das_pval = neg_log10_pvalue(adj_pval)
+        elif category == 'DGE':
+            if self._study_dge_n is not None:
+                raise ValueError(f"Attempt to add DGE data twice for {go_label}")
+            self._study_dge_n, self._study_dge_m, self._study_dge_perc = disencombobulate(study)
+            self._dge_pval = neg_log10_pvalue(adj_pval)
+
+    def get_das_percentage(self):
+        """Percentage parsed from the DAS study column, or None."""
+        return self._study_das_perc
+
+    def get_dge_percentage(self):
+        """Percentage parsed from the DGE study column, or None."""
+        return self._study_dge_perc
+
+    def get_das_pvalue(self):
+        """-log10 adjusted DAS p-value; 0.0 if no DAS line was seen."""
+        return 0.0 if self._das_pval is None else self._das_pval
+
+    def get_dge_pvalue(self):
+        """-log10 adjusted DGE p-value; 0.0 if no DGE line was seen."""
+        return 0.0 if self._dge_pval is None else self._dge_pval
+
+    def get_go_id(self):
+        """ID of the GO term."""
+        return self._go_id
+
+
+def get_all_isopret_output_files(indir):
+    """Return the sorted paths of the regular files in indir that end with .tsv."""
+    paths = (os.path.join(indir, f) for f in sorted(os.listdir(indir)))
+    return [p for p in paths if os.path.isfile(p) and p.endswith(".tsv")]
+
+
+def main():
+    """Parse the command line, read the files and print the significant GO terms."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--input_dir', help='input directory', type=str)
+    args = parser.parse_args()
+    input_dir = args.input_dir
+    if input_dir is None:
+        print("Need to provide path to isopret file directory with -i argument")
+        sys.exit(1)
+    if not os.path.isdir(input_dir):
+        print(f"-i argument ({input_dir}) was not a directory")
+        sys.exit(1)
+    print(f"Searching {input_dir} for isopret files")
+
+    go_id_to_label_d = {}
+    das_pval_d = defaultdict(list)
+    dge_pval_d = defaultdict(list)
+
+    isopret_files = get_all_isopret_output_files(input_dir)
+    print(f"We got {len(isopret_files)} input files")
+
+    for fname in isopret_files:
+        das_d = {}
+        with open(fname) as f:
+            for line in f:
+                fields = line.strip().split('\t')
+                # Skip blank/truncated lines and rows (e.g. a header) that are neither DAS nor DGE
+                if len(fields) < 7 or fields[0] not in ('DAS', 'DGE'):
+                    continue
+                category, go_label, go_id, study, population, pval, adj_pval = fields[:7]
+                go_id_to_label_d[go_id] = go_label
+                if go_id in das_d:
+                    das_d[go_id].add_line(category, go_label, go_id, study, population, pval, adj_pval)
+                else:
+                    das_d[go_id] = DgeDasRank(category, go_label, go_id, study, population, pval, adj_pval)
+        for dgedas in das_d.values():
+            go_id = dgedas.get_go_id()
+            das_pval_d[go_id].append(dgedas.get_das_pvalue())
+            dge_pval_d[go_id].append(dgedas.get_dge_pvalue())
+
+    # Number of tested GO terms, used as the Bonferroni factor
+    N = len(das_pval_d)
+
+    for go_id, das_pval_list in das_pval_d.items():
+        dge_pval_list = dge_pval_d[go_id]
+        go_label = go_id_to_label_d.get(go_id)
+        T, p = stats.wilcoxon(das_pval_list, dge_pval_list, zero_method='zsplit', correction=False, alternative='two-sided')
+        p_bonferroni = min(1.0, p * N)
+        if p_bonferroni < 0.05:
+            mean_das = np.mean(das_pval_list)
+            mean_dge = np.mean(dge_pval_list)
+            print(f"{go_label} ({go_id}): T={T}; p={p_bonferroni} (Bonferroni); mean DAS: {mean_das}; mean DGE: {mean_dge}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_isopret_go.sh b/scripts/run_isopret_go.sh
new file mode 100755
index 00000000..de74cade
--- /dev/null
+++ b/scripts/run_isopret_go.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Run isopretGO on every *.txt file of an input directory.
+# Usage: run_isopret_go.sh -i <input_directory> -o <output_directory>
+
+while getopts i:o: flag
+do
+    case "${flag}" in
+        i) input_directory=${OPTARG};;
+        o) output_directory=${OPTARG};;
+    esac
+done
+
+if [ ! -d "${input_directory}" ]
+then
+    echo "Need to provide -i argument"
+    exit 1
+fi
+
+if [ ! -d "${output_directory}" ]
+then
+    echo "Need to provide -o argument (create empty directory if necessary!)"
+    exit 1
+fi
+
+echo "Running isopretGO from ${input_directory} and placing output in ${output_directory}"
+
+# first build latest version
+#cd ..; mvn clean package
+
+# Get path of current file and its directory
+path=$(readlink -f "${BASH_SOURCE:-$0}")
+DIR_PATH=$(dirname "$path")
+PARENT_DIR=$(dirname "$DIR_PATH")
+JAR_PATH="$PARENT_DIR/isopret-cli/target/isopret-cli.jar"
+DATA_PATH="$PARENT_DIR/data"
+
+for entry in "$input_directory"/*
+do
+    if [[ $entry == *.txt ]]; then
+        echo "java -jar $JAR_PATH GO -d $DATA_PATH -b $entry -o ${output_directory} --export-all"
+        # Stay in the caller's working directory so relative -i/-o paths keep resolving for every file
+        java -jar "$JAR_PATH" GO -d "$DATA_PATH" -b "$entry" -o "${output_directory}" --export-all
+    fi
+done