diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..aa09f5c --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,35 @@ +name: Publish Python Package to PyPI + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + Publish: + # prevents this action from running on forks + if: github.repository == 'open2c/assemblyinfo' + + runs-on: ubuntu-latest + permissions: + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build + run: python -m build + + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/README.md b/README.md index 87d582a..3b54041 100644 --- a/README.md +++ b/README.md @@ -1 +1,113 @@ -# assemblyinfo +# Assemblyinfo: Interact with assembly metadata in Python + +![CI](https://github.com/open2c/assemblyinfo/actions/workflows/ci.yml/badge.svg) +[![Docs status](https://readthedocs.org/projects/genomeinfo/badge/)](https://genomeinfo.readthedocs.io/en/latest/) +[![Slack](https://img.shields.io/badge/chat-slack-%233F0F3F?logo=slack)](https://bit.ly/open2c-slack) + +Assemblyinfo simplifies the management and analysis of genome assembly metadata in Python. + +This package provides: + +* Efficient tools for querying and manipulating assembly information datasets. +* Streamlined methods for importing, exporting, and converting between common chromosome formats. +* Utilities for retrieving assembly statistics across different versions or species. + +Read the [documentation](https://genomeinfo.readthedocs.io/en/latest/) for more information. 
+ + +## Installation + +Assemblyinfo is available on [PyPI](https://pypi.org/project/assemblyinfo/): + +```sh +pip install assemblyinfo +``` + +## Basic operations on chromosome data + +Assemblyinfo offers a flexible and straightforward interface to interact with assembly data and perform basic queries. + +```python +import assemblyinfo + +db = assemblyinfo.connect() +hg38 = db.assembly_info("hg38", roles=["assembled"]) +``` + +It makes it easy to get chromosome sizes: + +```text +hg38.chromsizes + +> name +> chr1 248956422 +> chr2 242193529 +> ... +``` + +chromosome equivalences: + +```text +hg38.chromeq + +> ncbi genbank refseq +> chr1 1 CM000663.2 NC_000001.11 +> chr2 2 CM000664.2 NC_000002.12 +> chr3 3 CM000665.2 NC_000003.12 +> ... +``` + +or assembly metadata: + +```text +hg38.metadata + +> {'assembly_level': 'Chromosome', + 'assembly_type': 'haploid-with-alt-loci', + 'bioproject': 'PRJNA168', + 'submitter': 'Genome Reference Consortium', + 'synonyms': ['GRCh38', 'hg38'], + 'taxid': '9606', + 'species': 'homo_sapiens', + 'common_name': 'human', + ... } +``` + +and more! + +# Request an assembly + +Feel free to open an issue and request a non-reference assembly! 
Currently supported species are: + +```plaintext +['caenorhabditis_elegans', + 'homo_sapiens', + 'mus_musculus', + 'drosophila_melanogaster', + 'danio_rerio', + 'bos_taurus', + 'gallus_gallus', + 'canis_lupus_familiaris'] +``` + +You can also easily see which specific assemblies are supported by running: + +```python +db = assemblyinfo.connect() +db.available_assemblies() +``` + +## Citing + +If you use ***assemblyinfo*** in your work, please refer to: + +```bibtex +@software{assemblyinfo_2024, + author = {Open2C}, + title = {assemblyinfo}, + year = {2024}, + publisher = {Github}, + version = {v0.0.1}, + url = {https://github.com/open2c/assemblyinfo} +} +``` diff --git a/assemblyinfo/core/assembly.py b/assemblyinfo/core/assembly.py index 1b5b9e1..f23dbb2 100644 --- a/assemblyinfo/core/assembly.py +++ b/assemblyinfo/core/assembly.py @@ -35,8 +35,8 @@ def chromeq(self) -> Dict[str, Dict[str, str]]: return pd.DataFrame(self.aliases).T def __repr__(self): - return (f"Assembly(assembly={self.assembly}", - f"species={self.species}", + return (f"Assembly(assembly={self.assembly}, " + f"species={self.species}, " f"common_name={self.common_name})") diff --git a/docs/tutorials/get_quick_stats.ipynb b/docs/tutorials/get_quick_stats.ipynb index 2d00e90..7b23297 100644 --- a/docs/tutorials/get_quick_stats.ipynb +++ b/docs/tutorials/get_quick_stats.ipynb @@ -7,63 +7,37 @@ "source": [ "# Get quick insights about the species you're working on!\n", "\n", - "Retrieve useful stats using GenomeInfo for desired species." + "Retrieve useful stats using AssemblyInfo for desired species." 
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "685ac1c1-bb6d-4b4e-a785-a101e09ee983", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/home/alejandro/Documents/projects/forks/genomeinfo\n", "Defaulting to user installation because normal site-packages is not writeable\n", - "Obtaining file:///home/alejandro/Documents/projects/forks/genomeinfo\n", - " Installing build dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", - "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", - "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from genomeinfo==0.1.0) (1.24.3)\n", - "Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from genomeinfo==0.1.0) (1.5.3)\n", - "Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from genomeinfo==0.1.0) (16.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->genomeinfo==0.1.0) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->genomeinfo==0.1.0) (2023.3.post1)\n", - "Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->genomeinfo==0.1.0) (1.16.0)\n", - "Building wheels for collected packages: genomeinfo\n", - " Building editable for genomeinfo (pyproject.toml) ... 
\u001b[?25ldone\n", - "\u001b[?25h Created wheel for genomeinfo: filename=genomeinfo-0.1.0-py2.py3-none-any.whl size=2327 sha256=53ce877682e61a6a75f45e4398587bc38f6925a3e3849f59b4be67452f75ddcc\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-pujs6iy3/wheels/4e/cb/e8/0baa7aa991848767127e29ea4738849dea6d2c9bd867edb942\n", - "Successfully built genomeinfo\n", - "Installing collected packages: genomeinfo\n", - " Attempting uninstall: genomeinfo\n", - " Found existing installation: genomeinfo 0.1.0\n", - " Uninstalling genomeinfo-0.1.0:\n", - " Successfully uninstalled genomeinfo-0.1.0\n", - "Successfully installed genomeinfo-0.1.0\n" + "Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)\n", + "Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)\n", + "Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (1.5.3)\n", + "Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)\n", + "Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)\n" ] } ], "source": [ - "# Assumming you are running this from your computer\n", - "# At this moment, GenomeInfo is not available in pypi yet\n", - "\n", - "%cd ../\n", - "!pip3 install -e ." 
+ "!pip3 install assemblyinfo" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 3, "id": "e984df29-5550-49e1-8e0e-57871e4e409f", "metadata": {}, "outputs": [], @@ -72,47 +46,36 @@ "\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", - "from genomeinfo import GenomeInfo" + "import assemblyinfo" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 4, "id": "c873247a-98d9-41e9-bd5e-6b0f2537b660", "metadata": {}, "outputs": [], "source": [ "# use the connect() method to retrieve our database!\n", "\n", - "db = GenomeInfo.connect()" + "db = assemblyinfo.connect()" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 8, "id": "ebb9df1d-dad9-45fd-973b-db76dd11df50", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genome Information:\n", - "===================\n", - "Species:\n", - " - caenorhabditis_elegans, homo_sapiens, mus_musculus, drosophila_melanogaster, danio_rerio, bos_taurus, gallus_gallus, canis_lupus_familiaris\n", - "\n", - "Common Names:\n", - " - celegans, human, mouse, fruitfly, zebrafish, cow, chicken, dog\n", - "\n", - "Assemblies (UCSC):\n", - " - ce11, ce6, hg19, hg38, hg17, hg18, mm9, mm10, mm39, mm6, mm7, mm8, dm3, dm6, danRer6, danRer7, danRer10, danRer11, danRer5, bosTau9, galGal7, canFam4, hs1, canFam2, canFam3, canFam6, ROS_Cfam_1.0, galGal3, galGal4, galGal5, galGal6, canFam5\n", - "\n", - "Assemblies (NCBI):\n", - " - WS144, WBcel215, WBcel235, WS190, WS195, GRCh37, GRCh38, NCBI35, NCBI36, MGSCv37, GRCm38, GRCm39, MGSCv34, MGSCv35, MGSCv36, Release_5, Release_6, Release_6_plus_ISO1_MT, Zv8, Zv9, GRCz10, GRCz11, Zv7, ARS-UCD1.1, ARS-UCD1.2, ARS-UCD1.3, ARS-UCD2.0, bGalGal1.mat.broiler.GRCg7b, UU_Cfam_GSD_1.0, T2T-CHM13, ASM3317019v1, ASM3317019v2, CanFam2.0, CanFam3.1, Dog10K_Boxer_Tasha, ROS_Cfam_1.0, Gallus_gallus-2.1, Gallus_gallus-4.0, Gallus_gallus-5.0, GRCg6, GRCg6a, UMICH_Zoey_3.1, ASM2820141v1\n", - "\n", - 
"Please pick an entry and retrieve your desired data!\n" - ] + "data": { + "text/plain": [ + "'Genome Information:\\n===================\\nSpecies:\\n - caenorhabditis_elegans, homo_sapiens, mus_musculus, drosophila_melanogaster, danio_rerio, bos_taurus, gallus_gallus, canis_lupus_familiaris\\n\\nCommon Names:\\n - celegans, human, mouse, fruitfly, zebrafish, cow, chicken, dog\\n\\nAssemblies (UCSC):\\n - ce11, ce6, hg19, hg38, hg17, hg18, mm9, mm10, mm39, mm6, mm7, mm8, dm3, dm6, danRer6, danRer7, danRer10, danRer11, danRer5, bosTau9, galGal7, canFam4, hs1, canFam2, canFam3, canFam6, ROS_Cfam_1.0, galGal3, galGal4, galGal5, galGal6, canFam5\\n\\nAssemblies (NCBI):\\n - WS144, WBcel215, WBcel235, WS190, WS195, GRCh37, GRCh38, NCBI35, NCBI36, MGSCv37, GRCm38, GRCm39, MGSCv34, MGSCv35, MGSCv36, Release_5, Release_6, Release_6_plus_ISO1_MT, Zv8, Zv9, GRCz10, GRCz11, Zv7, ARS-UCD1.1, ARS-UCD1.2, ARS-UCD1.3, ARS-UCD2.0, bGalGal1.mat.broiler.GRCg7b, UU_Cfam_GSD_1.0, T2T-CHM13, ASM3317019v1, ASM3317019v2, CanFam2.0, CanFam3.1, Dog10K_Boxer_Tasha, ROS_Cfam_1.0, Gallus_gallus-2.1, Gallus_gallus-4.0, Gallus_gallus-5.0, GRCg6, GRCg6a, UMICH_Zoey_3.1, ASM2820141v1\\n\\nPlease pick an entry and retrieve your desired data!'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -123,68 +86,37 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 15, "id": "50d5bbfe-df27-44d7-9c7b-445822a2cace", "metadata": {}, "outputs": [], "source": [ - "# for this example, we are going to use the dog as our model!\n", + "# for this example, we are going to use one of the dog assemblies as our model!\n", "\n", - "df = db.get_db().set_index(\"common_name\").loc[\"dog\",:].reset_index()" + "df = db.assembly_info(\"canFam6\", roles=[\"assembled\"], units=[\"primary\"])" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "4277e73e-a9e8-48df-a197-1ff2d958ebe6", + "execution_count": 26, + "id": 
"4df1036d-4ca5-4577-8157-a08d4e3b68c9", "metadata": {}, "outputs": [], "source": [ - "# here, we extract the scaffold-N50 in kb for each dog assembly\n", - "\n", - "n50 = [\n", - " (df.loc[idx, \"assembly\"], df.loc[idx, \"metadata\"].get('scaffold-N50')/1000)\n", - " for idx in df.index\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "67bb18e1-f7e9-4914-aa80-479a4935afce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('UU_Cfam_GSD_1.0', 64299.765),\n", - " ('CanFam2.0', 45337.677),\n", - " ('CanFam3.1', 45876.61),\n", - " ('Dog10K_Boxer_Tasha', 63738.581),\n", - " ('ROS_Cfam_1.0', 64037.277),\n", - " ('UMICH_Zoey_3.1', 64204.256)]" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# let's have a look!\n", + "# we will extract the ungapped length of each assembled and primary chromosomes\n", "\n", - "n50" + "ungapped_lengths = df.seqinfo[\"all-ungapped-length\"]" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 28, "id": "97fb5b8f-26b0-415a-828c-800e4b9a0e12", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "\n", "text/plain": [ "
" ] @@ -199,9 +131,8 @@ "# now, we are going to plot it!\n", "\n", "ax = (\n", - " pd.DataFrame(n50, columns=['assembly', 'n50'])\n", - " .set_index('assembly')\n", - " .plot(kind='bar', legend=False, title=\"Comparing Dog assemblies\")\n", + " ungapped_lengths\n", + " .plot(kind='bar', legend=False, title=\"Comparing unagpped lengths across chromosomes\")\n", ")\n", "\n", "plt.xticks(rotation=45, ha='right')\n", diff --git a/docs/tutorials/retrieve_chromosome_data.ipynb b/docs/tutorials/retrieve_chromosome_data.ipynb index f61c872..2139932 100644 --- a/docs/tutorials/retrieve_chromosome_data.ipynb +++ b/docs/tutorials/retrieve_chromosome_data.ipynb @@ -7,7 +7,7 @@ "source": [ "# Retrieve chromosome data using GenomeInfo!\n", "\n", - "Using GenomeInfo, we will retrieve the chromosome sizes from mm9!" + "Using AssemblyInfo, we will retrieve the chromosome sizes from mm9!" ] }, { @@ -15,10 +15,6 @@ "execution_count": 1, "id": "d9fb241d-6729-49f0-85a2-e5e4be69493c", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "scrolled": true }, "outputs": [ @@ -26,40 +22,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/alejandro/Documents/projects/forks/genomeinfo\n", "Defaulting to user installation because normal site-packages is not writeable\n", - "Obtaining file:///home/alejandro/Documents/projects/forks/genomeinfo\n", - " Installing build dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", - "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", - "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Preparing editable metadata (pyproject.toml) ... 
\u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from genomeinfo==0.1.0) (1.24.3)\n", - "Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from genomeinfo==0.1.0) (1.5.3)\n", - "Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from genomeinfo==0.1.0) (16.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->genomeinfo==0.1.0) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->genomeinfo==0.1.0) (2023.3.post1)\n", - "Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->genomeinfo==0.1.0) (1.16.0)\n", - "Building wheels for collected packages: genomeinfo\n", - " Building editable for genomeinfo (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25h Created wheel for genomeinfo: filename=genomeinfo-0.1.0-py2.py3-none-any.whl size=2327 sha256=53ce877682e61a6a75f45e4398587bc38f6925a3e3849f59b4be67452f75ddcc\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-xtzikhdz/wheels/4e/cb/e8/0baa7aa991848767127e29ea4738849dea6d2c9bd867edb942\n", - "Successfully built genomeinfo\n", - "Installing collected packages: genomeinfo\n", - " Attempting uninstall: genomeinfo\n", - " Found existing installation: genomeinfo 0.1.0\n", - " Uninstalling genomeinfo-0.1.0:\n", - " Successfully uninstalled genomeinfo-0.1.0\n", - "Successfully installed genomeinfo-0.1.0\n" + "Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)\n", + "Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)\n", + "Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) 
(1.5.3)\n", + "Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)\n", + "Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)\n" ] } ], "source": [ - "# Assumming you are running this from your computer\n", - "# At this moment, GenomeInfo is not available in pypi yet\n", - "\n", - "%cd ../\n", - "!pip3 install -e ." + "!pip3 install assemblyinfo" ] }, { @@ -69,9 +44,7 @@ "metadata": {}, "outputs": [], "source": [ - "# import GenomeInfo from the package!\n", - "\n", - "from genomeinfo import GenomeInfo" + "import assemblyinfo" ] }, { @@ -83,61 +56,124 @@ "source": [ "# use the connect() method to retrieve our database!\n", "\n", - "db = GenomeInfo.connect()" + "db = assemblyinfo.connect()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "184c2be8-2419-42c5-8aa6-6479a8d33baa", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genome Information:\n", - "===================\n", - "Species:\n", - " - caenorhabditis_elegans, homo_sapiens, mus_musculus, drosophila_melanogaster, danio_rerio, bos_taurus, gallus_gallus, canis_lupus_familiaris\n", - "\n", - "Common Names:\n", - " - celegans, human, mouse, fruitfly, zebrafish, cow, chicken, dog\n", - "\n", - "Assemblies (UCSC):\n", - " - ce11, ce6, hg19, hg38, hg17, hg18, mm9, mm10, mm39, mm6, mm7, mm8, dm3, dm6, danRer6, danRer7, danRer10, danRer11, danRer5, bosTau9, galGal7, canFam4, hs1, canFam2, canFam3, canFam6, ROS_Cfam_1.0, galGal3, galGal4, galGal5, galGal6, canFam5\n", - "\n", - "Assemblies (NCBI):\n", - " - 
WS144, WBcel215, WBcel235, WS190, WS195, GRCh37, GRCh38, NCBI35, NCBI36, MGSCv37, GRCm38, GRCm39, MGSCv34, MGSCv35, MGSCv36, Release_5, Release_6, Release_6_plus_ISO1_MT, Zv8, Zv9, GRCz10, GRCz11, Zv7, ARS-UCD1.1, ARS-UCD1.2, ARS-UCD1.3, ARS-UCD2.0, bGalGal1.mat.broiler.GRCg7b, UU_Cfam_GSD_1.0, T2T-CHM13, ASM3317019v1, ASM3317019v2, CanFam2.0, CanFam3.1, Dog10K_Boxer_Tasha, ROS_Cfam_1.0, Gallus_gallus-2.1, Gallus_gallus-4.0, Gallus_gallus-5.0, GRCg6, GRCg6a, UMICH_Zoey_3.1, ASM2820141v1\n", - "\n", - "Please pick an entry and retrieve your desired data!\n" - ] + "data": { + "text/plain": [ + "['WS144',\n", + " 'WBcel215',\n", + " 'WBcel235',\n", + " 'WS190',\n", + " 'WS195',\n", + " 'GRCh37',\n", + " 'GRCh38',\n", + " 'NCBI35',\n", + " 'NCBI36',\n", + " 'MGSCv37',\n", + " 'GRCm38',\n", + " 'GRCm39',\n", + " 'MGSCv34',\n", + " 'MGSCv35',\n", + " 'MGSCv36',\n", + " 'Release_5',\n", + " 'Release_6',\n", + " 'Release_6_plus_ISO1_MT',\n", + " 'Zv8',\n", + " 'Zv9',\n", + " 'GRCz10',\n", + " 'GRCz11',\n", + " 'Zv7',\n", + " 'ARS-UCD1.1',\n", + " 'ARS-UCD1.2',\n", + " 'ARS-UCD1.3',\n", + " 'ARS-UCD2.0',\n", + " 'bGalGal1.mat.broiler.GRCg7b',\n", + " 'UU_Cfam_GSD_1.0',\n", + " 'T2T-CHM13',\n", + " 'ASM3317019v1',\n", + " 'ASM3317019v2',\n", + " 'CanFam2.0',\n", + " 'CanFam3.1',\n", + " 'Dog10K_Boxer_Tasha',\n", + " 'ROS_Cfam_1.0',\n", + " 'Gallus_gallus-2.1',\n", + " 'Gallus_gallus-4.0',\n", + " 'Gallus_gallus-5.0',\n", + " 'GRCg6',\n", + " 'GRCg6a',\n", + " 'UMICH_Zoey_3.1',\n", + " 'ASM2820141v1',\n", + " ,\n", + " 'ce11',\n", + " 'ce6',\n", + " 'hg19',\n", + " 'hg38',\n", + " 'hg17',\n", + " 'hg18',\n", + " 'mm9',\n", + " 'mm10',\n", + " 'mm39',\n", + " 'mm6',\n", + " 'mm7',\n", + " 'mm8',\n", + " 'dm3',\n", + " 'dm6',\n", + " 'danRer6',\n", + " 'danRer7',\n", + " 'danRer10',\n", + " 'danRer11',\n", + " 'danRer5',\n", + " 'bosTau9',\n", + " 'galGal7',\n", + " 'canFam4',\n", + " 'hs1',\n", + " 'canFam2',\n", + " 'canFam3',\n", + " 'canFam6',\n", + " 'ROS_Cfam_1.0',\n", + 
" 'galGal3',\n", + " 'galGal4',\n", + " 'galGal5',\n", + " 'galGal6',\n", + " 'canFam5']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# you can easily see what's inside the database by just running .info()\n", + "# you can easily see whhich assemblies are available by running:\n", "\n", - "db.info()" + "db.available_assemblies()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "fd2eb51a-99fe-4050-86f7-deddc6161db2", "metadata": {}, "outputs": [], "source": [ - "# to get the chromosome sizes you just need to use .get_chromsizes()\n", + "# to get the chromosome sizes you just need to use extract the data using 'assembly_info()'\n", "# here, we are specifying that we want only assembled chromosomes to bypass\n", "# scaffolds or unplaced sequences\n", "\n", - "chromsizes = db.get_chromsizes('mm9', roles=[\"assembled\"])\n", - "chromosomes = list(chromsizes.index)" + "mm9 = db.assembly_info('mm9', roles=[\"assembled\"])" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "7c4e2b61-6afd-4df3-817f-63a76b70bde8", "metadata": {}, "outputs": [ @@ -170,7 +206,7 @@ "Name: length, dtype: Int64" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -178,49 +214,7 @@ "source": [ "# let's check out the result!\n", "\n", - "chromsizes" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "73139067-d320-4803-b2c4-8a44673018b7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['chr1',\n", - " 'chr2',\n", - " 'chr3',\n", - " 'chr4',\n", - " 'chr5',\n", - " 'chr6',\n", - " 'chr7',\n", - " 'chr8',\n", - " 'chr9',\n", - " 'chr10',\n", - " 'chr11',\n", - " 'chr12',\n", - " 'chr13',\n", - " 'chr14',\n", - " 'chr15',\n", - " 'chr16',\n", - " 'chr17',\n", - " 'chr18',\n", - " 'chr19',\n", - " 'chrX',\n", - " 'chrY',\n", - " 'chrM']" - ] - }, - "execution_count": 7, - "metadata": 
{}, - "output_type": "execute_result" - } - ], - "source": [ - "chromosomes" + "mm9.chromsizes" ] } ], diff --git a/docs/tutorials/translate_chromosome_names.ipynb b/docs/tutorials/translate_chromosome_names.ipynb index 8737b1f..dc4c7c5 100644 --- a/docs/tutorials/translate_chromosome_names.ipynb +++ b/docs/tutorials/translate_chromosome_names.ipynb @@ -20,40 +20,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/alejandro/Documents/projects/forks/genomeinfo\n", "Defaulting to user installation because normal site-packages is not writeable\n", - "Obtaining file:///home/alejandro/Documents/projects/forks/genomeinfo\n", - " Installing build dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", - "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", - "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from genomeinfo==0.1.0) (1.24.3)\n", - "Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from genomeinfo==0.1.0) (1.5.3)\n", - "Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from genomeinfo==0.1.0) (16.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->genomeinfo==0.1.0) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->genomeinfo==0.1.0) (2023.3.post1)\n", - "Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->genomeinfo==0.1.0) (1.16.0)\n", - "Building wheels for collected packages: genomeinfo\n", - " Building editable for genomeinfo (pyproject.toml) ... 
\u001b[?25ldone\n", - "\u001b[?25h Created wheel for genomeinfo: filename=genomeinfo-0.1.0-py2.py3-none-any.whl size=2327 sha256=53ce877682e61a6a75f45e4398587bc38f6925a3e3849f59b4be67452f75ddcc\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-oyr80cjs/wheels/4e/cb/e8/0baa7aa991848767127e29ea4738849dea6d2c9bd867edb942\n", - "Successfully built genomeinfo\n", - "Installing collected packages: genomeinfo\n", - " Attempting uninstall: genomeinfo\n", - " Found existing installation: genomeinfo 0.1.0\n", - " Uninstalling genomeinfo-0.1.0:\n", - " Successfully uninstalled genomeinfo-0.1.0\n", - "Successfully installed genomeinfo-0.1.0\n" + "Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)\n", + "Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)\n", + "Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (1.5.3)\n", + "Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)\n", + "Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)\n" ] } ], "source": [ - "# Assumming you are running this from your computer\n", - "# At this moment, GenomeInfo is not available in pypi yet\n", - "\n", - "%cd ../\n", - "!pip3 install -e ." 
+ "!pip3 install assemblyinfo" ] }, { @@ -63,9 +42,7 @@ "metadata": {}, "outputs": [], "source": [ - "# import GenomeInfo from the package!\n", - "\n", - "from genomeinfo import GenomeInfo" + "import assemblyinfo" ] }, { @@ -77,46 +54,35 @@ "source": [ "# use the connect() method to retrieve our database!\n", "\n", - "db = GenomeInfo.connect()" + "db = assemblyinfo.connect()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "6dc5c5dd-fcba-445e-ba90-853a429340c1", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genome Information:\n", - "===================\n", - "Species:\n", - " - caenorhabditis_elegans, homo_sapiens, mus_musculus, drosophila_melanogaster, danio_rerio, bos_taurus, gallus_gallus, canis_lupus_familiaris\n", - "\n", - "Common Names:\n", - " - celegans, human, mouse, fruitfly, zebrafish, cow, chicken, dog\n", - "\n", - "Assemblies (UCSC):\n", - " - ce11, ce6, hg19, hg38, hg17, hg18, mm9, mm10, mm39, mm6, mm7, mm8, dm3, dm6, danRer6, danRer7, danRer10, danRer11, danRer5, bosTau9, galGal7, canFam4, hs1, canFam2, canFam3, canFam6, ROS_Cfam_1.0, galGal3, galGal4, galGal5, galGal6, canFam5\n", - "\n", - "Assemblies (NCBI):\n", - " - WS144, WBcel215, WBcel235, WS190, WS195, GRCh37, GRCh38, NCBI35, NCBI36, MGSCv37, GRCm38, GRCm39, MGSCv34, MGSCv35, MGSCv36, Release_5, Release_6, Release_6_plus_ISO1_MT, Zv8, Zv9, GRCz10, GRCz11, Zv7, ARS-UCD1.1, ARS-UCD1.2, ARS-UCD1.3, ARS-UCD2.0, bGalGal1.mat.broiler.GRCg7b, UU_Cfam_GSD_1.0, T2T-CHM13, ASM3317019v1, ASM3317019v2, CanFam2.0, CanFam3.1, Dog10K_Boxer_Tasha, ROS_Cfam_1.0, Gallus_gallus-2.1, Gallus_gallus-4.0, Gallus_gallus-5.0, GRCg6, GRCg6a, UMICH_Zoey_3.1, ASM2820141v1\n", - "\n", - "Please pick an entry and retrieve your desired data!\n" - ] + "data": { + "text/plain": [ + "'Genome Information for homo_sapiens:\\n===================\\nCommon Names:\\n - human\\n\\nAssemblies (UCSC):\\n - hg19, hg38, hg17, hg18, hs1\\n\\nAssemblies 
(NCBI):\\n - GRCh37, GRCh38, NCBI35, NCBI36, T2T-CHM13\\n\\n'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# you can easily see what's inside the database by just running .info()\n", + "# you can easily see what assemblies are available for each species using 'get_species_info()'\n", "\n", - "db.info()" + "db.get_species_info(\"homo_sapiens\")" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "id": "42c7b9fc-0312-4f25-b2c3-d4f166c2a03b", "metadata": {}, "outputs": [ @@ -150,20 +116,21 @@ " 'MT']" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# if you only need GenomeInfo as wrapper to extract names of assembled chromosomes:\n", + "# if you only need AssemblyInfo as wrapper to extract names of assembled chromosomes:\n", "\n", - "db.get_chrom_eq(\"hg38\", roles=[\"assembled\"]).ncbi.tolist()" + "hg38 = db.assembly_info(\"hg38\", roles=[\"assembled\"], provider=\"ncbi\")\n", + "hg38.chromnames" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "id": "5cf13f48-4f7e-41dd-b7bf-414cfaff1eaf", "metadata": {}, "outputs": [ @@ -194,7 +161,7 @@ " 'MSCHRUN_CTG23']" ] }, - "execution_count": 6, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -202,12 +169,13 @@ "source": [ "# what if (for some strange reason) we need the names of unplaced sequences in mm10?\n", "\n", - "db.get_chrom_eq(\"mm10\", roles=[\"unplaced\"]).ncbi.tolist()" + "mm10 = db.assembly_info(\"mm10\", roles=[\"unplaced\"], provider=\"ncbi\")\n", + "mm10.chromnames" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "id": "88927824-be4d-404f-b754-32eac4d3a65b", "metadata": {}, "outputs": [ @@ -260,7 +228,7 @@ " 'chrUn_JH584304']" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -268,7 +236,8 @@ "source": [ "# now 
unplace and unlocalized sequences, but in UCSC format?\n", "\n", - "db.get_chrom_eq(\"mm10\", roles=[\"unplaced\", \"unlocalized\"]).name.tolist()" + "mm10 = db.assembly_info(\"mm10\", roles=[\"unplaced\", \"unlocalized\"], provider=\"ucsc\")\n", + "mm10.chromnames" ] } ],