Skip to content

Commit

Permalink
Merge pull request #112 from socialgene/gff
Browse files Browse the repository at this point in the history
Changes protein hashing
  • Loading branch information
chasemc authored Oct 20, 2024
2 parents c79035f + 9bc7d39 commit 1981e3a
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
5 changes: 5 additions & 0 deletions socialgene/hashing/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ def sha512t24u(input):

def hash_aminos(input, **kwargs):
    """Hash an amino-acid sequence string after normalizing it.

    The sequence is normalized before hashing so that formatting
    differences do not produce different hashes:

    1. converted to uppercase,
    2. all whitespace (spaces, tabs, newlines) removed,
    3. any "*" characters at the very beginning or end stripped
       ("*" characters in the middle of the sequence are kept).

    Args:
        input: The sequence as a string.
        **kwargs: Forwarded unchanged to ``hasher``.

    Returns:
        Whatever ``hasher`` returns for the normalized sequence.
    """
    # make sure everything is uppercase before hashing
    cleaned = input.upper()
    # remove all whitespace
    cleaned = "".join(cleaned.split())
    # remove "*" if it's at the beginning or end of the string but not in the middle
    cleaned = cleaned.strip("*")
    # Hash the normalized sequence; previously the raw upper-cased input was
    # hashed and the cleaning steps above were silently discarded.
    return hasher(input=cleaned, **kwargs)


Expand Down
5 changes: 3 additions & 2 deletions socialgene/parsers/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from Bio import SeqIO

import socialgene.utils.file_handling as fh
from socialgene.utils.logging import log


class GFFParserMixin:
Expand All @@ -29,10 +30,10 @@ def _fasta_biopython(self, input_path: str):
# read the remaining lines into fasta dictionary using biopython
fasta_dict = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
if not fasta_dict:
raise ValueError("No sequences found in FASTA section")
log.warning("No sequences found in FASTA section")
return fasta_dict

def parse_gff_file(self, input_path: str, keep_sequence: bool = True):
def parse_gff_file(self, input_path: str, keep_sequence: bool = True, **kwargs):
# name of file without extension
assembly_uid = Path(input_path).name
assembly_uid = assembly_uid.split(".gff")[0]
Expand Down

0 comments on commit 1981e3a

Please sign in to comment.