feat(IPVC-2386): add comment example, bug fix, and remove try/except …

…added during trouble shooting
invitae · May 6, 2024 · 772075e · 772075e
1 parent b95b205
commit 772075e
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 25 deletions.
diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt
@@ -16,11 +16,6 @@
 #    │                       └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
 #    └── refseq
 #        └── H_sapiens
-#            ├── historical
-#            │   └── GRCh38
-#            │       └── GCF_000001405.40-RS_2023_03_historical
-#            │           ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
-#            │           └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
 #            └── mRNA_Prot
 #                ├── human.1.protein.faa.gz
 #                ├── human.1.rna.fna.gz
@@ -36,10 +31,6 @@ refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz
 refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz
 refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz
 
-## Historical RefSeq alignments
-refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
-refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
-
 ## Genome build and alignment data
 # Build 37
 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt

diff --git a/misc/refseq-historical-backfill/ncbi_extract_gbff.py b/misc/refseq-historical-backfill/ncbi_extract_gbff.py
@@ -169,10 +169,10 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
                             )
                         )
                 except SeqRecordFeatureError as e:
-                    logger.error(f"SeqRecordFeatureError processing {sr.id}: {e}")
+                    logger.error(f"SeqRecordFeatureError processing {r.id}: {e}")
                     raise
                 except ValueError as e:
-                    logger.error(f"ValueError processing {sr.id}: {e}")
+                    logger.error(f"ValueError processing {r.id}: {e}")
                     raise
 
 

diff --git a/sbin/ncbi-parse-gbff b/sbin/ncbi-parse-gbff
@@ -111,20 +111,17 @@ if __name__ == "__main__":
             if srf.id.partition("_")[0] not in ["NM", "NR"]:
                 skipped_ids.add(srf.id)
                 continue
-            try:
-                ti = TxInfo(
-                    ac=srf.id,
-                    origin=opts.origin,
-                    gene_id=srf.gene_id,
-                    gene_symbol=srf.gene_symbol,
-                    cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i),
-                    exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i),
-                    transl_except=TxInfo.serialize_transl_except(srf.transl_except),
-                )
-                tiw.write(ti)
-                genes.add(srf.gene_symbol)
-            except SeqRecordFeatureError as e:
-                logger.error("skipping {id}: {msg}".format(id=srf.id, msg=e))
+            ti = TxInfo(
+                ac=srf.id,
+                origin=opts.origin,
+                gene_id=srf.gene_id,
+                gene_symbol=srf.gene_symbol,
+                cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i),
+                exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i),
+                transl_except=TxInfo.serialize_transl_except(srf.transl_except),
+            )
+            tiw.write(ti)
+            genes.add(srf.gene_symbol)
         logger.info("{ng} genes in {fn} ({c})".format(ng=len(genes), fn=fn, c=prefixes))
         total_genes ^= genes
         all_prefixes += prefixes

diff --git a/src/uta/parsers/seqrecord.py b/src/uta/parsers/seqrecord.py
@@ -28,6 +28,24 @@ def cds_feature(self) -> Optional[SeqFeature]:
         Some NCBI records will contain multiple CDS features. In these one CDS describes a protein
         with accession and protein sequence, they other CDS features describes a pseudogene. This method
         will preferentially choose the CDS feature with a protein sequence.
+        Example:
+                 CDS             422..778
+                                 /gene="C6orf119"
+                                 /gene_synonym="dJ427A4.2"
+                                 /codon_start=1
+                                 /product="chromosome 6 open reading frame 119"
+                                 /protein_id="NP_001012240.1"
+                                 /db_xref="GI:59276067"
+                                 /db_xref="GeneID:353267"
+                                 /translation="MTDTAEAVPNFEEMFASRFTENDKEYQEYLKRPPESPPIVEEWN
+                                 SRAGGNQRNRGNRLQDNRQFRGRDNRWGWPSDNRSNQWHGRSWGNNYPQHRQEPYYPQ
+                                 QYGHYGYNQRPPYGYY"
+                 CDS             422..775
+                                 /locus_tag="RP3-427A4.2-001"
+                                 /note="match: proteins: Q9BTL3 Q9CQY2 Q9CWI1"
+                                 /pseudo
+                                 /codon_start=1
+                                 /product="Novel pseudogene"
         """
         cds_features = self.features_by_type.get("CDS")
         if cds_features is None: