Skip to content

Commit

Permalink
sort complex files
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Apr 30, 2024
1 parent 7adeeb9 commit 456dbe5
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 74 deletions.
93 changes: 44 additions & 49 deletions jcvi/formats/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@

import re
import sys
import os
import os.path as op
import shutil
import logging
import string
import hashlib

from random import choice

from itertools import groupby, zip_longest
from more_itertools import grouper, pairwise

Expand All @@ -19,12 +20,13 @@
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils.CheckSum import seguid

from jcvi.formats.base import BaseFile, DictFile, must_open
from jcvi.formats.bed import Bed
from jcvi.utils.cbook import percentage
from jcvi.utils.console import printf
from jcvi.utils.table import write_csv
from jcvi.apps.base import OptionParser, ActionDispatcher, cleanup, need_update
from ..apps.base import OptionParser, ActionDispatcher, cleanup, need_update
from ..utils.cbook import percentage
from ..utils.console import printf
from ..utils.table import write_csv

from .base import BaseFile, DictFile, must_open
from .bed import Bed


class Fasta(BaseFile, dict):
Expand Down Expand Up @@ -166,19 +168,17 @@ def sequence(self, f, asstring=True):
return seq


"""
Class derived from https://gist.github.com/933737
Original code written by David Winter (https://github.com/dwinter)
Code writted to answer this challenge at Biostar:
http://biostar.stackexchange.com/questions/5902/
class ORFFinder(object):
"""
Class derived from https://gist.github.com/933737
Original code written by David Winter (https://github.com/dwinter)
(Code includes improvements from Brad Chapman)
"""
Code writted to answer this challenge at Biostar:
http://biostar.stackexchange.com/questions/5902/
(Code includes improvements from Brad Chapman)
class ORFFinder(object):
"""Find the longest ORF in a given sequence
Find the longest ORF in a given sequence
"seq" is a string, if "start" is not provided any codon can be the start of
and ORF. If muliple ORFs have the longest length the first one encountered
is printed
Expand Down Expand Up @@ -356,51 +356,48 @@ def rc(s):
def main():

actions = (
("clean", "remove irregular chars in FASTA seqs"),
("diff", "check if two fasta records contain same information"),
(
"extract",
"given fasta file and seq id, retrieve the sequence " + "in fasta format",
"given fasta file and seq id, retrieve the sequence in fasta format",
),
("longestorf", "find longest orf for CDS fasta"),
("translate", "translate CDS to proteins"),
("info", "run `sequence_info` on fasta files"),
("summary", "report the real no of bases and N's in fasta files"),
("uniq", "remove records that are the same"),
("ids", "generate a list of headers"),
("fastq", "combine fasta and qual to create fastq file"),
(
"format",
"trim accession id to the first space or switch id "
+ "based on 2-column mapping file",
"trim accession id to the first space or switch id based on 2-column mapping file",
),
("pool", "pool a bunch of fastafiles together and add prefix"),
("random", "randomly take some records"),
("simulate", "simulate random fasta file for testing"),
("diff", "check if two fasta records contain same information"),
("identical", "given 2 fasta files, find all exactly identical records"),
("trim", "given a cross_match screened fasta, trim the sequence"),
("trimsplit", "split sequences at lower-cased letters"),
("sort", "sort the records by IDs, sizes, etc."),
("filter", "filter the records by size"),
("fromtab", "convert 2-column sequence file to FASTA format"),
("gaps", "print out a list of gap sizes within sequences"),
("gc", "plot G+C content distribution"),
("identical", "given 2 fasta files, find all exactly identical records"),
("ids", "generate a list of headers"),
("info", "run `sequence_info` on fasta files"),
("ispcr", "reformat paired primers into isPcr query format"),
("join", "concatenate a list of seqs and add gaps in between"),
("longestorf", "find longest orf for CDS fasta"),
("pair", "sort paired reads to .pairs, rest to .fragments"),
(
"pairinplace",
"starting from fragment.fasta, find if "
+ "adjacent records can form pairs",
"starting from fragment.fasta, find if adjacent records can form pairs",
),
("fastq", "combine fasta and qual to create fastq file"),
("tidy", "normalize gap sizes and remove small components in fasta"),
("pool", "pool a bunch of fastafiles together and add prefix"),
("qual", "generate dummy .qual file based on FASTA file"),
("random", "randomly take some records"),
("sequin", "generate a gapped fasta file for sequin submission"),
("gaps", "print out a list of gap sizes within sequences"),
("join", "concatenate a list of seqs and add gaps in between"),
("simulate", "simulate random fasta file for testing"),
(
"some",
"include or exclude a list of records (also performs on "
+ ".qual file if available)",
"include or exclude a list of records (also performs on .qual file if available)",
),
("qual", "generate dummy .qual file based on FASTA file"),
("clean", "remove irregular chars in FASTA seqs"),
("ispcr", "reformat paired primers into isPcr query format"),
("fromtab", "convert 2-column sequence file to FASTA format"),
("gc", "plot G+C content distribution"),
("sort", "sort the records by IDs, sizes, etc."),
("summary", "report the real no of bases and N's in fasta files"),
("tidy", "normalize gap sizes and remove small components in fasta"),
("translate", "translate CDS to proteins"),
("trim", "given a cross_match screened fasta, trim the sequence"),
("trimsplit", "split sequences at lower-cased letters"),
("uniq", "remove records that are the same"),
)
p = ActionDispatcher(actions)
p.dispatch(globals())
Expand All @@ -410,8 +407,6 @@ def simulate_one(fw, name, size):
"""
Simulate a random sequence with name and size
"""
from random import choice

seq = Seq("".join(choice("ACGT") for _ in range(size)))
s = SeqRecord(seq, id=name, description="Fake sequence")
SeqIO.write([s], fw, "fasta")
Expand Down
48 changes: 23 additions & 25 deletions jcvi/formats/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from ..annotation.reformat import atg_name
from ..utils.cbook import AutoVivification
from ..utils.range import range_minmax
from ..utils.range import Range, range_minmax
from ..utils.orderedcollections import DefaultOrderedDict, OrderedDict, parse_qs

from .base import DictFile, LineFile, must_open, is_number
Expand Down Expand Up @@ -484,8 +484,6 @@ def to_range(obj, score=None, id=None, strand=None):
"""
Given a gffutils object, convert it to a range object
"""
from jcvi.utils.range import Range

if score or id:
_score = score if score else obj.score
_id = id if id else obj.id
Expand All @@ -503,39 +501,39 @@ def main():
("addparent", "merge sister features and infer their parent"),
("bed", "parse gff and produce bed file for particular feature type"),
("bed12", "produce bed12 file for coding features"),
("fromgtf", "convert gtf to gff3 format"),
("gtf", "convert gff3 to gtf format"),
("gb", "convert gff3 to genbank format"),
("sort", "sort the gff file"),
("chain", "fill in parent features by chaining children"),
("children", "find all children that belongs to the same parent"),
("cluster", "cluster transcripts based on shared splicing structure"),
("extract", "extract contig or features from gff file"),
("filter", "filter the gff file based on Identity and Coverage"),
("sizes", "calculate sizes of features in gff file"),
("format", "format the gff file, change seqid, etc."),
(
"fixboundaries",
"fix boundaries of parent features by range chaining child features",
),
("chain", "fill in parent features by chaining children"),
("rename", "change the IDs within the gff3"),
("uniq", "remove the redundant gene models"),
("liftover", "adjust gff coordinates based on tile number"),
("note", "extract certain attribute field for each feature"),
("load", "extract the feature (e.g. CDS) sequences and concatenate"),
("extract", "extract contig or features from gff file"),
("split", "split the gff into one contig per file"),
("merge", "merge several gff files into one"),
("parents", "find the parents given a list of IDs"),
("children", "find all children that belongs to the same parent"),
(
"fixpartials",
"fix 5/3 prime partial transcripts, locate nearest in-frame start/stop",
),
("format", "format the gff file, change seqid, etc."),
("frombed", "convert from bed format to gff3"),
("fromgtf", "convert gtf to gff3 format"),
("fromsoap", "convert from soap format to gff3"),
("gapsplit", "split alignment GFF3 at gaps based on CIGAR string"),
("gb", "convert gff3 to genbank format"),
("gtf", "convert gff3 to gtf format"),
("liftover", "adjust gff coordinates based on tile number"),
("load", "extract the feature (e.g. CDS) sequences and concatenate"),
("merge", "merge several gff files into one"),
("note", "extract certain attribute field for each feature"),
("orient", "orient the coding features based on translation"),
("parents", "find the parents given a list of IDs"),
("rename", "change the IDs within the gff3"),
("sizes", "calculate sizes of features in gff file"),
("sort", "sort the gff file"),
("splicecov", "tag gff introns with coverage info from junctions.bed"),
("split", "split the gff into one contig per file"),
("summary", "print summary stats for features of different types"),
("cluster", "cluster transcripts based on shared splicing structure"),
(
"fixpartials",
"fix 5/3 prime partial transcripts, locate nearest in-frame start/stop",
),
("uniq", "remove the redundant gene models"),
)

p = ActionDispatcher(actions)
Expand Down

0 comments on commit 456dbe5

Please sign in to comment.