From 612b3fa3100df83622daf8c41c49e42b0f3bf71e Mon Sep 17 00:00:00 2001 From: nicolaasuni Date: Tue, 27 Nov 2018 09:42:14 +0000 Subject: [PATCH] Update RegionKey documentation --- README.md | 53 +++++++++++++++++++++++++++++++----- VERSION | 2 +- c/doc/Doxyfile | 2 +- c/resources/debian/control | 2 +- c/resources/rpm/rpm.spec | 2 +- c/src/variantkey/regionkey.h | 2 +- conda/c.src/meta.yaml | 2 +- conda/c.vk/meta.yaml | 2 +- conda/python-class/meta.yaml | 8 +++--- conda/python/meta.yaml | 2 +- conda/r/meta.yaml | 2 +- go/src/variantkey.go | 4 +-- python-class/setup.py | 4 +-- python/setup.py | 2 +- r/variantkey/DESCRIPTION | 2 +- 15 files changed, 65 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index e3f0cd8..238f1b8 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Nicola Asuni. [VariantKey - A Reversible Numerical Representation of Human Genet * [VariantKey Properties](#vkproperties) * [VariantKey Input values](#vkinput) * **[RegionKey](#regionkey)** + * [RegionKey Properties](#rkproperties) * [Encoding String IDs](#esid) * [Binary file formats for lookup tables](#binaryfiles) * [C Library](#clib) @@ -57,7 +58,7 @@ The [VariantKey Format](#vkformat) doesn't represent universal codes, it only en This software library can be used to generate and reverse [VariantKey](#vkformat)s and [RegionKey](#regionkey)s. - +---------- ## Quick Start @@ -98,6 +99,8 @@ cd c make test ``` +---------- + ## Human Genetic Variant Definition @@ -421,11 +424,12 @@ Normalized variant | 19 | 29238771 | C | G * **`ALT`** - *alternate non-reference allele* : String containing a sequence of [nucleotide letters](https://en.wikipedia.org/wiki/Nucleic_acid_notation). +---------- ## RegionKey -*RegionKey* encodes a human genetic region (defined as the set of *chromosome*, *start position*, *end position* and *strand direction*) in a 64 bit unsigned integer number. 
+*RegionKey* encodes a human genomic region (defined as the set of *chromosome*, *start position*, *end position* and *strand direction*) in a 64 bit unsigned integer number. RegionKey allows to repesent a region as a single entity, and provides analogous properties as the ones listed in [VariantKey Properties](#vkproperties). @@ -442,7 +446,24 @@ The RegionKey is composed of 4 sections arranged in 64 bit: STRAND ``` -* **`CHROM`** : 5 bit to represent the chromosome. + +Example of RegionKey encoding: + +``` + | CHROM | STARTPOS | ENDPOS | STRAND | +------------------+-------+------------------------------+------------------------------+--------+ + Raw region | chr19 | 29238771 | 29239026 | +1 | +Normalized region | 19 | 29238771 | 29239026 | +1 | +------------------+-------+------------------------------+------------------------------+--------+ + RegionKey bin | 10011 | 0001101111100010010111110011 | 0001101111100010011011110010 | 01 0 | +------------------+-------+------------------------------+---------------------------------------+ + RegionKey hex | 98DF12F98DF13792 | + RegionKey dec | 11015544076609075090 | +------------------+------------------------------------------------------------------------------+ +``` + +* **`CHROM`** : 5 bit to represent the chromosome. + An identifier from the reference genome. It only has 26 valid values: autosomes from 1 to 22, the sex chromosomes X=23 and Y=24, mitochondria MT=25 and a symbol NA=0 to indicate an invalid value. ``` 0 4 @@ -458,7 +479,8 @@ The RegionKey is composed of 4 sections arranged in 64 bit: The chromosome is encoded as in VariantKey. -* **`STARTPOS`** : 28 bit for the region START position. +* **`STARTPOS`** : 28 bit for the region START position. + The region start position in the chromosome, with the first base having position 0. The largest expected value is less than 250 million to represent the last base pair in Chromosome 1. 
``` 0 5 32 63 @@ -474,7 +496,8 @@ The RegionKey is composed of 4 sections arranged in 64 bit: This section is encoded as in VariantKey POS. -* **`ENDPOS`** : 28 bit for the region END position. +* **`ENDPOS`** : 28 bit for the region END position. + The region end position in the chromosome. The end position is equivalent to (STARTPOS + REGION_LENGTH), such that the base having position ENDPOS is not included in the region. ``` 0 33 60 63 @@ -489,7 +512,8 @@ The RegionKey is composed of 4 sections arranged in 64 bit: ``` The end position is equivalent to (STARTPOS + REGION_LENGTH). -* **`STRAND`** : 2 bit to encode the strand direction. +* **`STRAND`** : 2 bit to encode the strand direction. + (optional) The direction of the DNA strand. This is useful when encoding genic regions. ``` 0 61 62 @@ -503,7 +527,7 @@ The RegionKey is composed of 4 sections arranged in 64 bit: ``` -1 : 2 dec = "10" bin = reverse (minus) strand direction - 0 : 0 dec = "00" bin = unknown strand direction + 0 : 0 dec = "00" bin = unknown or not applicable strand direction +1 : 1 dec = "01" bin = forward (plus) strand direction ``` @@ -512,6 +536,18 @@ The RegionKey is composed of 4 sections arranged in 64 bit: This software library provides several functions to operate with *RegionKey* and interact with *VariantKey*. + +### RegionKey Properties + +* It is compatible with VariantKey. +* It can be encoded and decoded on-the-fly. +* Sorting by RegionKey is equivalent to sorting by CHROM and STARTPOS. +* The 64 bit RegionKey can be exported as a single 16 character hexadecimal string. +* Sorting the hexadecimal representation of RegionKey in alphabetical order is equivalent to sorting the RegionKey numerically. +* RegionKey can be used as a main database key to index data by "region". This simplifies common searching, merging and filtering operations. 
+ +---------- + ## Encoding String IDs @@ -523,6 +559,7 @@ This library contains extra functions to encode some string IDs to 64 bit unsign * The `hash_string_id` function creates a 64 bit unsigned integer hash of the input string. +---------- ## Binary files for lookup tables @@ -565,6 +602,8 @@ https://sourceforge.net/projects/variantkey/files/ 1800c351f61f65d3 A AAGAAAGAAAG ``` +---------- + ## C Library diff --git a/VERSION b/VERSION index 8a30e8f..ade6522 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.4.0 +5.4.1 diff --git a/c/doc/Doxyfile b/c/doc/Doxyfile index de5e0ff..4e1973e 100644 --- a/c/doc/Doxyfile +++ b/c/doc/Doxyfile @@ -32,7 +32,7 @@ PROJECT_NAME = "VariantKey" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 5.4.0 +PROJECT_NUMBER = 5.4.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer diff --git a/c/resources/debian/control b/c/resources/debian/control index 1861045..426fa00 100644 --- a/c/resources/debian/control +++ b/c/resources/debian/control @@ -14,6 +14,6 @@ Depends: ${shlibs:Depends}, ${misc:Depends} Description: Numerical Encoding for Human Genetic Variants. Provides C header-only files for: VariantKey, a reversible numerical encoding schema for human genetic variants. - RegionKey, a reversible numerical encoding schema for human genetic regions. + RegionKey, a reversible numerical encoding schema for human genomic regions. ESID, a reversible numerical encoding schema for genetic string identifiers. diff --git a/c/resources/rpm/rpm.spec b/c/resources/rpm/rpm.spec index 7498228..53df21a 100644 --- a/c/resources/rpm/rpm.spec +++ b/c/resources/rpm/rpm.spec @@ -21,7 +21,7 @@ Provides: %{gh_project} = %{version} %description Provides C header-only files for: VariantKey, a reversible numerical encoding schema for human genetic variants. 
-RegionKey, a reversible numerical encoding schema for human genetic regions. +RegionKey, a reversible numerical encoding schema for human genomic regions. ESID, a reversible numerical encoding schema for genetic string identifiers. %build diff --git a/c/src/variantkey/regionkey.h b/c/src/variantkey/regionkey.h index f0f6b73..3c7e350 100644 --- a/c/src/variantkey/regionkey.h +++ b/c/src/variantkey/regionkey.h @@ -34,7 +34,7 @@ * @file regionkey.h * @brief RegionKey main functions. * - * The functions provided here allows to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Regions. + * The functions provided here allow generating and processing 64 bit Unsigned Integer Keys for Human Genomic Regions. * The RegionKey is sortable for chromosome and start position, and it is also fully reversible. */ diff --git a/conda/c.src/meta.yaml b/conda/c.src/meta.yaml index c4a0568..0bf8417 100644 --- a/conda/c.src/meta.yaml +++ b/conda/c.src/meta.yaml @@ -1,6 +1,6 @@ package: name: variantkey-src - version: 5.4.0 + version: 5.4.1 source: path: ../.. diff --git a/conda/c.vk/meta.yaml b/conda/c.vk/meta.yaml index ef81475..43bd1a3 100644 --- a/conda/c.vk/meta.yaml +++ b/conda/c.vk/meta.yaml @@ -1,6 +1,6 @@ package: name: variantkey-vk - version: 5.4.0 + version: 5.4.1 source: path: ../.. diff --git a/conda/python-class/meta.yaml b/conda/python-class/meta.yaml index 2647b6c..fa77b47 100644 --- a/conda/python-class/meta.yaml +++ b/conda/python-class/meta.yaml @@ -1,6 +1,6 @@ package: name: pyvariantkey - version: 5.4.0 + version: 5.4.1 source: path: ../.. 
@@ -14,11 +14,11 @@ requirements: - setuptools - numpy >=1.15.0 build: - - variantkey >=5.4.0 + - variantkey >=5.4.1 - numpy >=1.15.0 run: - python - - variantkey >=5.4.0 + - variantkey >=5.4.1 - numpy >=1.15.0 test: @@ -30,7 +30,7 @@ test: - pytest-cov - pytest-benchmark - pycodestyle - - variantkey >=5.4.0 + - variantkey >=5.4.1 - numpy >=1.15.0 imports: - pyvariantkey diff --git a/conda/python/meta.yaml b/conda/python/meta.yaml index 54f9f3c..43acb1a 100644 --- a/conda/python/meta.yaml +++ b/conda/python/meta.yaml @@ -1,6 +1,6 @@ package: name: variantkey - version: 5.4.0 + version: 5.4.1 source: path: ../.. diff --git a/conda/r/meta.yaml b/conda/r/meta.yaml index 7b50244..1c88a9e 100644 --- a/conda/r/meta.yaml +++ b/conda/r/meta.yaml @@ -1,6 +1,6 @@ package: name: r-variantkey - version: 5.4.0 + version: 5.4.1 source: path: ../.. diff --git a/go/src/variantkey.go b/go/src/variantkey.go index bef3a86..c2a69bc 100644 --- a/go/src/variantkey.go +++ b/go/src/variantkey.go @@ -611,7 +611,7 @@ func (mf TMMFile) NormalizedVariantKey(chrom string, pos uint32, posindex uint8, // --- REGIONKEY --- -// TRegionKey contains a representation of a genetic region key +// TRegionKey contains a representation of a genomic region key type TRegionKey struct { Chrom uint8 `json:"chrom"` StartPos uint32 `json:"startpos"` @@ -619,7 +619,7 @@ type TRegionKey struct { Strand uint8 `json:"strand"` } -// TRegionKeyRev contains a genetic region components +// TRegionKeyRev contains the components of a genomic region type TRegionKeyRev struct { Chrom string `json:"chrom"` StartPos uint32 `json:"startpos"` diff --git a/python-class/setup.py b/python-class/setup.py index ba741bd..1cdc596 100644 --- a/python-class/setup.py +++ b/python-class/setup.py @@ -33,7 +33,7 @@ def run(self): setup( name='pyvariantkey', - version='5.4.0.1', + version='5.4.1.1', keywords=('variantkey variant key genetic genomics'), description="VariantKey Python wrapper class", long_description=read('../README.md'), @@ -51,7 
+51,7 @@ def run(self): ], install_requires=[ 'numpy>=1.15.0', - 'variantkey>=5.4.0.1', + 'variantkey>=5.4.1.1', ], extras_require={ 'test': [ diff --git a/python/setup.py b/python/setup.py index 2fadaa4..2613ca8 100644 --- a/python/setup.py +++ b/python/setup.py @@ -30,7 +30,7 @@ def run(self): setup( name='variantkey', - version='5.4.0.1', + version='5.4.1.1', keywords=('variantkey variant key genetic genomics'), description="VariantKey Bindings for Python", long_description=read('../README.md'), diff --git a/r/variantkey/DESCRIPTION b/r/variantkey/DESCRIPTION index f909814..3a5fc33 100644 --- a/r/variantkey/DESCRIPTION +++ b/r/variantkey/DESCRIPTION @@ -1,6 +1,6 @@ Package: variantkey Title: Genetic VariantKey -Version: 5.4.0.1 +Version: 5.4.1.1 Authors@R: person("Nicola", "Asuni", email = "info@genomicsplc.com", role = c("aut", "cre")) Description: Tools to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants. The VariantKey is sortable for chromosome and position,