Merge pull request #112 from socialgene/gff

Changes protein hashing
socialgene · Oct 20, 2024 · 1981e3a · 1981e3a
2 parents c79035f + 9bc7d39
commit 1981e3a
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 2 deletions.
diff --git a/socialgene/hashing/hashing.py b/socialgene/hashing/hashing.py
@@ -24,6 +24,25 @@ def sha512t24u(input):
 
 def hash_aminos(input, **kwargs):
+    """Normalize an amino acid sequence string and hash it.
+
+    The input is uppercased, all whitespace is removed, and any "*"
+    characters at the start or end are stripped (a "*" in the middle is
+    kept). The normalized string is then passed to ``hasher``.
+
+    Args:
+        input: sequence string to hash
+        **kwargs: forwarded unchanged to ``hasher``
+
+    Returns:
+        The return value of ``hasher`` for the normalized sequence.
+    """
     # make sure everything is uppercase before hashing
+    cleaned = input.upper()
+    # remove all whitespace
+    cleaned = "".join(cleaned.split())
+    # remove "*" if it's at the beginning or end of the string but not in the middle
+    cleaned = cleaned.strip("*")
-    return hasher(input=input.upper(), **kwargs)
+    # hash the normalized string; passing `input` here would discard the cleanup above
+    return hasher(input=cleaned, **kwargs)
 
 

diff --git a/socialgene/parsers/gff.py b/socialgene/parsers/gff.py
@@ -4,6 +4,7 @@
 from Bio import SeqIO
 
 import socialgene.utils.file_handling as fh
+from socialgene.utils.logging import log
 
 
 class GFFParserMixin:
@@ -29,10 +30,10 @@ def _fasta_biopython(self, input_path: str):
             # read the remaining lines into fasta dictionary using biopython
             fasta_dict = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
             if not fasta_dict:
-                raise ValueError("No sequences found in FASTA section")
+                log.warning("No sequences found in FASTA section")
         return fasta_dict
 
-    def parse_gff_file(self, input_path: str, keep_sequence: bool = True):
+    def parse_gff_file(self, input_path: str, keep_sequence: bool = True, **kwargs):
         # name of file without extension
         assembly_uid = Path(input_path).name
         assembly_uid = assembly_uid.split(".gff")[0]