Skip to content

Commit

Permalink
feat(IPVC-2386): add comment example, bug fix, and remove try/except …
Browse files Browse the repository at this point in the history
…added during trouble shooting
  • Loading branch information
bsgiles73 committed May 6, 2024
1 parent b95b205 commit 772075e
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 25 deletions.
9 changes: 0 additions & 9 deletions etc/ncbi-files.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@
# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
# └── refseq
# └── H_sapiens
# ├── historical
# │ └── GRCh38
# │ └── GCF_000001405.40-RS_2023_03_historical
# │ ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
# │ └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
# └── mRNA_Prot
# ├── human.1.protein.faa.gz
# ├── human.1.rna.fna.gz
Expand All @@ -36,10 +31,6 @@ refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz
refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz
refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz

## Historical RefSeq alignments
refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz

## Genome build and alignment data
# Build 37
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt
Expand Down
4 changes: 2 additions & 2 deletions misc/refseq-historical-backfill/ncbi_extract_gbff.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,10 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
)
)
except SeqRecordFeatureError as e:
logger.error(f"SeqRecordFeatureError processing {sr.id}: {e}")
logger.error(f"SeqRecordFeatureError processing {r.id}: {e}")
raise
except ValueError as e:
logger.error(f"ValueError processing {sr.id}: {e}")
logger.error(f"ValueError processing {r.id}: {e}")
raise


Expand Down
25 changes: 11 additions & 14 deletions sbin/ncbi-parse-gbff
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,17 @@ if __name__ == "__main__":
if srf.id.partition("_")[0] not in ["NM", "NR"]:
skipped_ids.add(srf.id)
continue
try:
ti = TxInfo(
ac=srf.id,
origin=opts.origin,
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i),
exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i),
transl_except=TxInfo.serialize_transl_except(srf.transl_except),
)
tiw.write(ti)
genes.add(srf.gene_symbol)
except SeqRecordFeatureError as e:
logger.error("skipping {id}: {msg}".format(id=srf.id, msg=e))
ti = TxInfo(
ac=srf.id,
origin=opts.origin,
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i),
exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i),
transl_except=TxInfo.serialize_transl_except(srf.transl_except),
)
tiw.write(ti)
genes.add(srf.gene_symbol)
logger.info("{ng} genes in {fn} ({c})".format(ng=len(genes), fn=fn, c=prefixes))
total_genes ^= genes
all_prefixes += prefixes
Expand Down
18 changes: 18 additions & 0 deletions src/uta/parsers/seqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,24 @@ def cds_feature(self) -> Optional[SeqFeature]:
Some NCBI records will contain multiple CDS features. In these one CDS describes a protein
with accession and protein sequence, they other CDS features describes a pseudogene. This method
will preferentially choose the CDS feature with a protein sequence.
Example:
CDS 422..778
/gene="C6orf119"
/gene_synonym="dJ427A4.2"
/codon_start=1
/product="chromosome 6 open reading frame 119"
/protein_id="NP_001012240.1"
/db_xref="GI:59276067"
/db_xref="GeneID:353267"
/translation="MTDTAEAVPNFEEMFASRFTENDKEYQEYLKRPPESPPIVEEWN
SRAGGNQRNRGNRLQDNRQFRGRDNRWGWPSDNRSNQWHGRSWGNNYPQHRQEPYYPQ
QYGHYGYNQRPPYGYY"
CDS 422..775
/locus_tag="RP3-427A4.2-001"
/note="match: proteins: Q9BTL3 Q9CQY2 Q9CWI1"
/pseudo
/codon_start=1
/product="Novel pseudogene"
"""
cds_features = self.features_by_type.get("CDS")
if cds_features is None:
Expand Down

0 comments on commit 772075e

Please sign in to comment.