diff --git a/.github/workflows/clippy_check.yml b/.github/workflows/clippy_check.yml
index e79c7a2..8162f89 100644
--- a/.github/workflows/clippy_check.yml
+++ b/.github/workflows/clippy_check.yml
@@ -1,6 +1,11 @@
-on: push
 name: Clippy check
 
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
 # Make sure CI fails on all warnings, including Clippy lints
 env:
   RUSTFLAGS: "-Dwarnings"
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index c3ced2a..861b360 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -1,4 +1,4 @@
-name: coverage
+name: Code coverage
 
 on:
   push:
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
new file mode 100644
index 0000000..0bdcb78
--- /dev/null
+++ b/.github/workflows/pypi.yml
@@ -0,0 +1,157 @@
+name: Upload to PyPI
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  linux:
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        platform:
+          - runner: ubuntu-latest
+            target: x86_64
+          - runner: ubuntu-latest
+            target: x86
+          - runner: ubuntu-latest
+            target: aarch64
+          - runner: ubuntu-latest
+            target: armv7
+          - runner: ubuntu-latest
+            target: s390x
+          - runner: ubuntu-latest
+            target: ppc64le
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.platform.target }}
+          args: --release --out dist --find-interpreter --manifest-path ./pykmertools/Cargo.toml
+          sccache: 'true'
+          manylinux: auto
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-linux-${{ matrix.platform.target }}
+          path: dist
+
+  musllinux:
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        platform:
+          - runner: ubuntu-latest
+            target: x86_64
+          - runner: ubuntu-latest
+            target: x86
+          - runner: ubuntu-latest
+            target: aarch64
+          - runner: ubuntu-latest
+            target: armv7
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.platform.target }}
+          args: --release --out dist --find-interpreter --manifest-path ./pykmertools/Cargo.toml
+          sccache: 'true'
+          manylinux: musllinux_1_2
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-musllinux-${{ matrix.platform.target }}
+          path: dist
+
+  windows:
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        platform:
+          - runner: windows-latest
+            target: x64
+          - runner: windows-latest
+            target: x86
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+          architecture: ${{ matrix.platform.target }}
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.platform.target }}
+          args: --release --out dist --find-interpreter --manifest-path ./pykmertools/Cargo.toml
+          sccache: 'true'
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-windows-${{ matrix.platform.target }}
+          path: dist
+
+  macos:
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        platform:
+          - runner: macos-15-intel # Intel runner; the macos-12 image has been retired
+            target: x86_64
+          - runner: macos-14
+            target: aarch64
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.platform.target }}
+          args: --release --out dist --find-interpreter --manifest-path ./pykmertools/Cargo.toml
+          sccache: 'true'
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-macos-${{ matrix.platform.target }}
+          path: dist
+
+  sdist:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build sdist
+        uses: PyO3/maturin-action@v1
+        with:
+          command: sdist
+          args: --out dist --manifest-path ./pykmertools/Cargo.toml
+      - name: Upload sdist
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-sdist
+          path: dist
+
+  release:
+    name: Release
+    runs-on: ubuntu-latest
+    needs: [linux, musllinux, windows, macos, sdist]
+    steps:
+      - uses: actions/download-artifact@v4
+      - name: Publish to PyPI
+        uses: PyO3/maturin-action@v1
+        env:
+          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+        with:
+          command: upload
+          args: --non-interactive --skip-existing wheels-*/*
diff --git a/Cargo.lock b/Cargo.lock
index bc1d043..d023ac8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -477,6 +477,12 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "indoc"
+version = "2.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
+
 [[package]]
 name = "instant"
 version = "0.1.12"
@@ -581,6 +587,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "memoffset"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "miniz_oxide"
 version = "0.7.2"
@@ -706,6 +721,12 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
 
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
 [[package]]
 name = "ordered-float"
 version = "3.9.2"
@@ -752,6 +773,79 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "pykmertools"
+version = "0.1.0"
+dependencies = [
+ "composition",
+ "kmer",
+ "pyo3",
+ "rayon",
+]
+
+[[package]]
+name = "pyo3"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "831e8e819a138c36e212f3af3fd9eeffed6bf1510a805af35b0edee5ffa59433"
+dependencies = [
+ "cfg-if",
+ "indoc",
+ "libc",
+ "memoffset",
+ "once_cell",
+ "portable-atomic",
+ "pyo3-build-config",
+ "pyo3-ffi",
+ "pyo3-macros",
+ "unindent",
+]
+
+[[package]]
+name = "pyo3-build-config"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e8730e591b14492a8945cdff32f089250b05f5accecf74aeddf9e8272ce1fa8"
+dependencies = [
+ "once_cell",
+ "target-lexicon",
+]
+
+[[package]]
+name = "pyo3-ffi"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e97e919d2df92eb88ca80a037969f44e5e70356559654962cbb3316d00300c6"
+dependencies = [
+ "libc",
+ "pyo3-build-config",
+]
+
+[[package]]
+name = "pyo3-macros"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb57983022ad41f9e683a599f2fd13c3664d7063a3ac5714cae4b7bee7d3f206"
+dependencies = [
+ "proc-macro2",
+ "pyo3-macros-backend",
+ "quote",
+ "syn 2.0.60",
+]
+
+[[package]]
+name = "pyo3-macros-backend"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec480c0c51ddec81019531705acac51bcdbeae563557c982aa8263bb96880372"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "pyo3-build-config",
+ "quote",
+ "syn 2.0.60",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.36"
@@ -1001,6 +1095,12 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "target-lexicon"
+version = "0.12.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
+
 [[package]]
 name = "thiserror"
 version = "1.0.58"
@@ -1045,6 +1145,12 @@ version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6"
 
+[[package]]
+name = "unindent"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
+
 [[package]]
 name = "utf8parse"
 version = "0.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index cb2c517..11d15bb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,10 @@
 [workspace]
 package.version = "0.1.0"
 package.edition = "2021"
-package.authors = ["Anuradha Wickramarachchi anuradhawick@gmail.com", "Vijini Mallawaarachchi viji.mallawaarachchi@gmail.com"]
+package.authors = ["Anuradha Wickramarachchi <anuradhawick@gmail.com>", "Vijini Mallawaarachchi <viji.mallawaarachchi@gmail.com>"]
+package.description = "kmertools is a k-mer based feature extraction tool designed to support metagenomics and other bioinformatics analytics."
+package.readme = "README.md"
+package.license-file = "LICENSE"
 
-members = ["composition", "coverage", "kmertools", "kmer", "ktio", "counter", "misc"]
+members = ["composition", "coverage", "kmertools", "kmer", "ktio", "counter", "misc", "pykmertools"]
 resolver = "2"
diff --git a/composition/Cargo.toml b/composition/Cargo.toml
index 88ef689..d894d59 100644
--- a/composition/Cargo.toml
+++ b/composition/Cargo.toml
@@ -3,6 +3,7 @@ name = "composition"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 [dependencies]
 kmer = { path = "../kmer" }
@@ -12,3 +13,6 @@ rayon = "1.10.0"
 
 [lib]
 doctest = false
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin_include)"] }
diff --git a/counter/Cargo.toml b/counter/Cargo.toml
index 4330b08..91df802 100644
--- a/counter/Cargo.toml
+++ b/counter/Cargo.toml
@@ -3,6 +3,7 @@ name = "counter"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/coverage/Cargo.toml b/coverage/Cargo.toml
index a4852b8..c45b6c2 100644
--- a/coverage/Cargo.toml
+++ b/coverage/Cargo.toml
@@ -3,6 +3,7 @@ name = "coverage"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/kmer/Cargo.toml b/kmer/Cargo.toml
index 0b8c957..618d808 100644
--- a/kmer/Cargo.toml
+++ b/kmer/Cargo.toml
@@ -3,6 +3,7 @@ name = "kmer"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 [dependencies]
 indicatif = "0.17.8"
diff --git a/kmertools/Cargo.toml b/kmertools/Cargo.toml
index 8425e2d..29f5c7f 100644
--- a/kmertools/Cargo.toml
+++ b/kmertools/Cargo.toml
@@ -3,6 +3,7 @@ name = "kmertools"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 [dependencies]
 clap = { version = "4.5.4", features = ["derive"] }
@@ -11,3 +12,6 @@ coverage = { path = "../coverage" }
 counter = { path = "../counter" }
 misc = { path = "../misc" }
 ktio = { path = "../ktio" }
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin_include)"] }
diff --git a/ktio/Cargo.toml b/ktio/Cargo.toml
index 1909adc..c1a6451 100644
--- a/ktio/Cargo.toml
+++ b/ktio/Cargo.toml
@@ -3,6 +3,7 @@ name = "ktio"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 [dependencies]
 bio = "1.6.0"
diff --git a/misc/Cargo.toml b/misc/Cargo.toml
index ddf2404..381527a 100644
--- a/misc/Cargo.toml
+++ b/misc/Cargo.toml
@@ -3,6 +3,7 @@ name = "misc"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
+description.workspace = true
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/pykmertools/.gitignore b/pykmertools/.gitignore
new file mode 100644
index 0000000..c8f0442
--- /dev/null
+++ b/pykmertools/.gitignore
@@ -0,0 +1,72 @@
+/target
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+.pytest_cache/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+.venv/
+env/
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+include/
+man/
+venv/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+pip-selfcheck.json
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# Rope
+.ropeproject
+
+# Django stuff:
+*.log
+*.pot
+
+.DS_Store
+
+# Sphinx documentation
+docs/_build/
+
+# PyCharm
+.idea/
+
+# VSCode
+.vscode/
+
+# Pyenv
+.python-version
diff --git a/pykmertools/Cargo.toml b/pykmertools/Cargo.toml
new file mode 100644
index 0000000..426947c
--- /dev/null
+++ b/pykmertools/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "pykmertools"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+readme.workspace = true
+license-file.workspace = true
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+[lib]
+name = "pykmertools"
+crate-type = ["cdylib"]
+
+[dependencies]
+pyo3 = "0.22.0"
+rayon = "1.10.0"
+composition = { path = "../composition" }
+kmer = { path = "../kmer" }
diff --git a/pykmertools/pyproject.toml b/pykmertools/pyproject.toml
new file mode 100644
index 0000000..7e98503
--- /dev/null
+++ b/pykmertools/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = ["maturin>=1.7,<2.0"]
+build-backend = "maturin"
+
+[project]
+name = "pykmertools"
+requires-python = ">=3.9"
+classifiers = [
+    "Programming Language :: Rust",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+]
+dynamic = ["version", "description", "license", "readme"]
+keywords = ["genomics", "bioinformatics"]
+
+
+[project.urls]
+Documentation = "https://github.com/anuradhawick/kmertools/wiki"
+"Bug Tracker" = "https://github.com/anuradhawick/kmertools/issues"
+"Source Code" = "https://github.com/anuradhawick/kmertools/"
+
+[tool.maturin]
+features = ["pyo3/extension-module"]
diff --git a/pykmertools/src/cgr.rs b/pykmertools/src/cgr.rs
new file mode 100644
index 0000000..cc1d8d4
--- /dev/null
+++ b/pykmertools/src/cgr.rs
@@ -0,0 +1,99 @@
+use pyo3::{exceptions::PyValueError, prelude::*};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use std::collections::HashMap;
+
+type Point = (f64, f64);
+
+/// Build the lookup data for a cgr drawn in a square of side `vecsize`:
+/// the centre of the square and a map from nucleotide byte (A, T, G, C, U
+/// in either case) to its corner of the square
+fn cgr_maps(vecsize: f64) -> (Point, HashMap<u8, Point>) {
+    let cgr_a: Point = (0.0, 0.0);
+    let cgr_t: Point = (vecsize, 0.0);
+    let cgr_g: Point = (vecsize, vecsize);
+    let cgr_c: Point = (0.0, vecsize);
+    let cgr_center: Point = (vecsize / 2.0, vecsize / 2.0);
+
+    let cgr_dict: HashMap<u8, Point> = [
+        (b'A', cgr_a), // Adenine
+        (b'T', cgr_t), // Thymine
+        (b'G', cgr_g), // Guanine
+        (b'C', cgr_c), // Cytosine
+        (b'U', cgr_t), // Uracil (demethylated form of thymine)
+        (b'a', cgr_a), // Adenine
+        (b't', cgr_t), // Thymine
+        (b'g', cgr_g), // Guanine
+        (b'c', cgr_c), // Cytosine
+        (b'u', cgr_t), // Uracil/Thymine
+    ]
+    .iter()
+    .cloned()
+    .collect();
+
+    (cgr_center, cgr_dict)
+}
+
+/// Computer for generating chaos game representation (cgr)
+#[pyclass]
+pub struct CgrComputer {
+    cgr_center: Point,
+    cgr_map: HashMap<u8, Point>,
+}
+
+#[pymethods]
+impl CgrComputer {
+    /// Initialise the cgr computer
+    /// Attributes:
+    ///     vecsize (int): side length of the square whose corners are the nucleotides
+    #[new]
+    #[pyo3(signature = (vecsize))]
+    fn new(vecsize: usize) -> Self {
+        let (cgr_center, cgr_map) = cgr_maps(vecsize as f64);
+
+        Self {
+            cgr_center,
+            cgr_map,
+        }
+    }
+
+    /// Generate the cgr
+    /// Attributes:
+    ///     seq (str): sequence as a string
+    /// Returns:
+    ///     list[tuple[float, float]]: one point per nucleotide, in sequence order
+    /// Raises:
+    ///     ValueError: if seq has a character other than A, T, G, C, U (any case)
+    #[pyo3(signature = (seq))]
+    fn vectorise_one(&self, seq: String) -> PyResult<Vec<Point>> {
+        let mut cgr = Vec::with_capacity(seq.len());
+        let mut cgr_marker = self.cgr_center;
+
+        for s in seq.as_bytes().iter() {
+            if let Some(&cgr_corner) = self.cgr_map.get(s) {
+                cgr_marker = (
+                    (cgr_corner.0 + cgr_marker.0) / 2.0,
+                    (cgr_corner.1 + cgr_marker.1) / 2.0,
+                );
+                cgr.push(cgr_marker);
+            } else {
+                return Err(PyValueError::new_err("Bad nucleotide, unable to proceed"));
+            }
+        }
+
+        Ok(cgr)
+    }
+
+    /// Generate the cgrs
+    /// Attributes:
+    ///     seqs (list[str]): list of sequences
+    /// Returns:
+    ///     list[list[tuple[float, float]]]: the cgr of each sequence, in input order
+    /// Raises:
+    ///     ValueError: if any sequence contains a bad nucleotide
+    #[pyo3(signature = (seqs))]
+    fn vectorise_batch(&self, seqs: Vec<String>) -> PyResult<Vec<Vec<Point>>> {
+        seqs.into_par_iter()
+            .map(|seq| self.vectorise_one(seq))
+            .collect()
+    }
+}
diff --git a/pykmertools/src/cov.rs b/pykmertools/src/cov.rs
new file mode 100644
index 0000000..666bbf0
--- /dev/null
+++ b/pykmertools/src/cov.rs
@@ -0,0 +1,16 @@
+use pyo3::prelude::*;
+
+/// Placeholder for a coverage computer: it holds no state and exposes
+/// only a constructor
+#[pyclass]
+pub struct CovComputer {}
+
+#[pymethods]
+impl CovComputer {
+    /// Create the (stateless) computer
+    #[new]
+    #[pyo3(signature = ())]
+    fn new() -> Self {
+        Self {}
+    }
+}
diff --git a/pykmertools/src/lib.rs b/pykmertools/src/lib.rs
new file mode 100644
index 0000000..5e718e9
--- /dev/null
+++ b/pykmertools/src/lib.rs
@@ -0,0 +1,19 @@
+mod cgr;
+mod cov;
+mod oligo;
+use cgr::CgrComputer;
+use oligo::OligoComputer;
+use pyo3::prelude::*;
+
+/// Pykmertools: kmertools python wrapper
+/// Modules:
+///     OligoComputer - computing oligonucleotide frequency vectors
+///                     from DNA sequences
+///     CgrComputer   - computing chaos game representations
+///                     for DNA sequences
+#[pymodule]
+fn pykmertools(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<OligoComputer>()?;
+    m.add_class::<CgrComputer>()?;
+    Ok(())
+}
diff --git a/pykmertools/src/oligo.rs b/pykmertools/src/oligo.rs
new file mode 100644
index 0000000..4d9be46
--- /dev/null
+++ b/pykmertools/src/oligo.rs
@@ -0,0 +1,84 @@
+use kmer::{kmer::KmerGenerator, numeric_to_kmer};
+use pyo3::prelude::*;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use std::collections::HashMap;
+
+/// Computer for generating oligonucleotide frequency vectors
+#[pyclass]
+pub struct OligoComputer {
+    ksize: usize,
+    kcount: usize,
+    // NOTE(review): element/key types inferred from usage below; confirm against kmer_pos_maps
+    pos_map: Vec<usize>,
+    pos_kmer: HashMap<usize, u64>,
+}
+
+#[pymethods]
+impl OligoComputer {
+    /// Initialise the kmer counter
+    /// Attributes:
+    ///     ksize (int): size of the k-mers to count
+    #[new]
+    #[pyo3(signature = (ksize))]
+    fn new(ksize: usize) -> Self {
+        let (min_mer_pos_map, pos_min_mer_map, kcount) = KmerGenerator::kmer_pos_maps(ksize);
+
+        Self {
+            ksize,
+            kcount,
+            pos_map: min_mer_pos_map,
+            pos_kmer: pos_min_mer_map,
+        }
+    }
+
+    /// Generate the oligo nucleotide vector
+    /// Attributes:
+    ///     seq (str): sequence as a string
+    ///     norm (bool): enable normalisation by counts
+    /// Returns:
+    ///     list[float]: k-mer counts, divided by their total if norm is true
+    #[pyo3(signature = (seq, norm=true))]
+    fn vectorise_one(&self, seq: String, norm: bool) -> Vec<f64> {
+        let mut vec = vec![0_f64; self.kcount];
+        let mut total = 0_f64;
+
+        for (fmer, rmer) in KmerGenerator::new(seq.as_bytes(), self.ksize) {
+            let min_mer = u64::min(fmer, rmer);
+            unsafe {
+                // SAFETY: assumes pos_map (from kmer_pos_maps) has an entry for every
+                // ksize-mer code and that each entry is < kcount, the length of vec
+                let &min_mer_pos = self.pos_map.get_unchecked(min_mer as usize);
+                *vec.get_unchecked_mut(min_mer_pos) += 1_f64;
+                total += 1_f64;
+            }
+        }
+        if norm {
+            vec.iter_mut().for_each(|el| *el /= f64::max(1_f64, total));
+        }
+        vec
+    }
+
+    /// Generate the oligo nucleotide vectors
+    /// Attributes:
+    ///     seqs (list[str]): list of sequences
+    ///     norm (bool): enable normalisation by counts
+    /// Returns:
+    ///     list[list[float]]: the vector of each sequence, in input order
+    #[pyo3(signature = (seqs, norm=true))]
+    fn vectorise_batch(&self, seqs: Vec<String>, norm: bool) -> Vec<Vec<f64>> {
+        seqs.into_par_iter()
+            .map(|seq| self.vectorise_one(seq, norm))
+            .collect()
+    }
+
+    /// Generate the header for oligo nucleotide vector
+    /// Returns:
+    ///     list[str]: the k-mer labelling each position of the vector
+    fn get_header(&self) -> Vec<String> {
+        let mut kmers = vec![String::new(); self.kcount];
+        for (&pos, &kmer) in self.pos_kmer.iter() {
+            kmers[pos] = numeric_to_kmer(kmer, self.ksize);
+        }
+        kmers
+    }
+}
diff --git a/tarpaulin.toml b/tarpaulin.toml
new file mode 100644
index 0000000..5b16f57
--- /dev/null
+++ b/tarpaulin.toml
@@ -0,0 +1,2 @@
+[tarpaulin]
+exclude = ["pykmertools"]