-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #40 from sanger-tol/kmer_count
Kmer count + Dimensionality reduction
- Loading branch information
Showing
24 changed files
with
954 additions
and
133 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
kmer_size: 7 | ||
n_neighbors_setting: 13 | ||
autoencoder_epochs_count: -1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
Script for counting kmer frequencies per sequence in a FASTA file | ||
Output: kmer counts (relative to sequence length) as a CSV table, written to the output file path | ||
Developed by Eerik Aunin ([email protected]) | ||
""" | ||
|
||
import argparse | ||
import general_purpose_functions as gpf | ||
import kcounter | ||
from collections import OrderedDict | ||
import pandas as pd | ||
|
||
|
||
def main(fasta_path, out_path, kmer_size):
    """Write per-sequence relative kmer frequencies of a FASTA file to a CSV file.

    Each (header, sequence) pair yielded by ``gpf.read_fasta_in_chunks`` becomes
    one CSV row holding the header, the sequence length and, for every kmer
    reported by ``kcounter.count_kmers`` (called with ``canonical_kmers=True``),
    the kmer count divided by the sequence length.

    Args:
        fasta_path: Path to the input FASTA file.
        out_path: Path of the CSV file to write.
        kmer_size: Length of the kmers to count.
    """
    rows = []
    for header, seq in gpf.read_fasta_in_chunks(fasta_path):
        # Sequences are upper-cased before counting.
        seq = seq.upper()
        seq_len = len(seq)
        kmer_counts = kcounter.count_kmers(seq, kmer_size, canonical_kmers=True)

        # The two fixed columns come first, followed by one column per kmer.
        row = OrderedDict([("header", header), ("seq_len", seq_len)])
        row.update((kmer, kmer_counts[kmer] / seq_len) for kmer in kmer_counts)
        rows.append(row)

    # Kmers missing from a sequence are NaN after the rows are combined;
    # they are written out as 0.
    table = pd.DataFrame(rows).fillna(0)
    table.to_csv(out_path, index=False)
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional kmer size.
    # The module docstring doubles as the --help description.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="1.0",
    )
    parser.add_argument(
        "fasta_path",
        type=str,
        help="Path to input FASTA file",
    )
    parser.add_argument(
        "out_path",
        type=str,
        help="Path for output CSV file",
    )
    parser.add_argument(
        "--kmer_size",
        type=int,
        default=7,
        help="kmer size (bp). Default: 7",
    )
    args = parser.parse_args()
    main(
        fasta_path=args.fasta_path,
        out_path=args.out_path,
        kmer_size=args.kmer_size,
    )
Oops, something went wrong.