diff --git a/.gitmodules b/.gitmodules index 51c058a..12ddf0a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "miqScoreNGSReadCountPublic"] path = miqScoreNGSReadCountPublic url = https://github.com/Zymo-Research/miqScoreNGSReadCountPublic.git +[submodule "miqScoreShotgunPublicSupport"] + path = miqScoreShotgunPublicSupport + url = https://github.com/Zymo-Research/miqScoreShotgunPublicSupport.git diff --git a/miqScoreShotgunPublicSupport b/miqScoreShotgunPublicSupport new file mode 160000 index 0000000..236e250 --- /dev/null +++ b/miqScoreShotgunPublicSupport @@ -0,0 +1 @@ +Subproject commit 236e250b33824a52c7f3b833ce3241eff1a101b7 diff --git a/miqScoreShotgunPublicSupport/__init__.py b/miqScoreShotgunPublicSupport/__init__.py deleted file mode 100644 index f4a6318..0000000 --- a/miqScoreShotgunPublicSupport/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -__all__ = ["parameters", - "projectData", - "formatReaders", - "reporting", - "alignmentAnalysis"] - -from . import parameters -from . import projectData -from . import formatReaders -from . import reporting -from . import alignmentAnalysis diff --git a/miqScoreShotgunPublicSupport/alignmentAnalysis/__init__.py b/miqScoreShotgunPublicSupport/alignmentAnalysis/__init__.py deleted file mode 100644 index 065eb84..0000000 --- a/miqScoreShotgunPublicSupport/alignmentAnalysis/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -__all__ = ["alignmentAnalysisPE", - "alignmentAnalysisSE", - "bwaHandler", - "minimap2"] - -from . import alignmentAnalysisPE -from . import alignmentAnalysisSE -from . import bwaHandler -from . import minimap2 \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/alignmentAnalysis/alignmentAnalysisPE.py b/miqScoreShotgunPublicSupport/alignmentAnalysis/alignmentAnalysisPE.py deleted file mode 100644 index ebe02cb..0000000 --- a/miqScoreShotgunPublicSupport/alignmentAnalysis/alignmentAnalysisPE.py +++ /dev/null @@ -1,301 +0,0 @@ -import os -import pysam -import typing - -''' -Read species calls: -Chimera_like: Both paired ends disagree, no good single-species explanation -Poor_quality: Depending on if call was being made on single or both ends, quality of read(s) for making call was insufficient -Unmapped_reads: No usable alignment data -Ambiguous_reads: Both reads support two or more species without a big enough delta in confidence to be decisive -If a species is given, that was a successful call -Under certain parameter settings, a tuple of multiple species may be given indicating reads that support multiple species -''' - - -class ReadAlignmentData(object): - __slots__ = ["qname", - "mapq", - "secondaryAlignment", - "isRead2", - "species"] - - def __init__(self, contig:str, mapq:int, qname:str, secondaryAlignment:bool, isRead2:bool): - self.species = extractSpecies(contig) - self.mapq = mapq - self.qname = qname - self.secondaryAlignment = secondaryAlignment - self.isRead2 = isRead2 - - def __str__(self): - secondary = "" - read = "R1" - if self.isRead2: - read = "R2" - if self.secondaryAlignment: - secondary = ", Secondary" - return "%s, %s, %s%s, %s " %(self.species, self.mapq, read, secondary, self.qname) - - -class ReadPairData(object): - __slots__ = ["forwardRead", - "reverseRead", - "speciesConflict", - "secondaryAlignment", - "mapped", - "bothMapped", - "calledSpecies", - "poorQualityDrop"] - - def __init__(self, read1:ReadAlignmentData, read2:ReadAlignmentData): - forwardRead = None - reverseRead = None - for read in [read1, read2]: - if read1.isRead2: - reverseRead = read1 - else: - 
forwardRead = read1 - if read2.isRead2: - reverseRead = read2 - else: - forwardRead = read2 - if not forwardRead and reverseRead: - raise MispairedReadError("Two reads of the same direction were given: %s and %s" %(read1, read2)) - self.forwardRead = forwardRead - self.reverseRead = reverseRead - self.speciesConflict = self.checkSpeciesConflict() - self.secondaryAlignment = self.checkSecondaryAlignment() - - def checkSpeciesConflict(self, readPairDisputeDecisionThresholdQDelta:int=40, minimumMapqForSingleReadConfidence:int=40, minimumMapqForPairedReadConfidence:int=30): - if bool(self.forwardRead.species) != bool(self.reverseRead.species): #Functionally a logical XOR, handles cases where one read was unmappable - self.mapped = True - self.bothMapped = False - if self.forwardRead.species: - if self.forwardRead.mapq < minimumMapqForSingleReadConfidence: - self.mapped = False - self.bothMapped = False - self.calledSpecies = None - self.poorQualityDrop = True - return False - self.calledSpecies = (self.forwardRead.species) - else: - if self.reverseRead.mapq < minimumMapqForSingleReadConfidence: - self.mapped = False - self.bothMapped = False - self.calledSpecies = None - self.poorQualityDrop = True - return False - self.calledSpecies = (self.reverseRead.species) - return False - if not self.forwardRead.species and not self.reverseRead.species: - self.mapped = False - self.bothMapped = False - else: - self.mapped = True - self.bothMapped = True - if self.forwardRead.species == self.reverseRead.species: - if self.forwardRead.mapq < minimumMapqForPairedReadConfidence or self.reverseRead.mapq < minimumMapqForPairedReadConfidence: - self.calledSpecies = None - self.poorQualityDrop = True - else: - self.poorQualityDrop = False - self.calledSpecies = (self.forwardRead.species) - return False - else: - mapqs = (self.forwardRead.mapq, self.reverseRead.mapq) - mapqDelta = abs(mapqs[0] - mapqs[1]) - if max(mapqs) < minimumMapqForSingleReadConfidence: - self.calledSpecies = None - self.poorQualityDrop = True - if mapqDelta >= readPairDisputeDecisionThresholdQDelta: - if mapqs[0] > mapqs[1]: - self.calledSpecies = self.forwardRead.species - else: - self.calledSpecies = self.reverseRead.species - self.poorQualityDrop = False - return False - self.calledSpecies = (self.forwardRead.species, self.reverseRead.species) - self.poorQualityDrop = False - return True - - def checkSecondaryAlignment(self): - return self.forwardRead.secondaryAlignment or self.reverseRead.secondaryAlignment - - -class MispairedReadError(Exception): - pass - - -def extractSpecies(referenceName:str): - if referenceName is None: - return None - return "_".join(referenceName.split("_")[:2]) - - -def readParallelProcessor(read:pysam.AlignedRead): - return ReadAlignmentData(read.reference_name, read.mapping_quality, read.query_name, read.is_secondary, read.is_read2) - - -def listBamFiles(folder:str): - folderFiles = os.listdir(folder) - bamFiles = [os.path.join(folder, file) for file in folderFiles if file.endswith(".bam")] - return bamFiles - - -def generateAnalyzedReadList(bamFilePath:str): - import datetime - startTime = datetime.datetime.now() - bamFile = pysam.AlignmentFile(bamFilePath, "rb") - analyzedReads = [] - readCount = 0 - for read in bamFile: - analyzedReads.append(ReadAlignmentData(read.reference_name, read.mapping_quality, read.query_name, read.is_secondary, read.is_read2)) - readCount += 1 - if readCount % 500000 == 0: - analysisTime = datetime.datetime.now() - startTime - print("Analyzed %s reads in %s" %(readCount, 
analysisTime), flush=True) - bamFile.close() - analysisTime = datetime.datetime.now() - startTime - print("Analyzed %s reads in %s" % (readCount, analysisTime)) - return analyzedReads - - -def readSorter(readList:typing.List[ReadAlignmentData]): - sortedReads = {} - readList.insert(0, None) - read = readList.pop() - while read is not None: - if not read.qname in sortedReads: - sortedReads[read.qname] = [] - sortedReads[read.qname].append(read) - read = readList.pop() - return sortedReads - - -def sortedReadsDictToList(readDict:dict): - readList = [] - qnames = list(readDict.keys()) - qnames.insert(0, None) - qname = qnames.pop() - while qname: - readList.append(readDict[qname]) - del readDict[qname] - qname = qnames.pop() - return readList - - -def getSpeciesCallCounts(readSetList:typing.List[typing.List[ReadAlignmentData]]): - import collections - speciesCalls = [callSpeciesFromReadSet(readSet) for readSet in readSetList] - speciesCounts = collections.Counter(speciesCalls) - return speciesCounts - - -def callSpeciesFromReadSet(readList:typing.List[ReadAlignmentData], reportSpeciesConflictListAsChimeraLike:bool=True): - if len(readList) == 2: - readPair = ReadPairData(*readList) - if readPair.speciesConflict: - if reportSpeciesConflictListAsChimeraLike: - return "Chimera_like" - else: - return readPair.calledSpecies - if not readPair.mapped: - return "Unaligned_reads" - if readPair.poorQualityDrop: - return "Poor_quality" - if not readPair.calledSpecies: - return "Unaligned_reads" - return readPair.calledSpecies - else: - return multimapDisputeResolution(readList) - - -def splitForwardAndReverse(readList:typing.List[ReadAlignmentData], removeUnaligned:bool=True): - forward = [] - reverse = [] - if removeUnaligned: - forward = [read for read in readList if not read.isRead2 and read.species] - reverse = [read for read in readList if read.isRead2 and read.species] - else: - forward = [read for read in readList if not read.isRead2] - reverse = [read for read in readList if read.isRead2] - return forward, reverse - - -def getConfidentReads(readList:typing.List[ReadAlignmentData], multimapDisputeResolutionQDelta:int=20): - if len(readList) == 1: - if readList[0].species: - return set([readList[0]]) - else: #Handles a case where one of the sets came back as unaligned - return set() - else: - mapqs = [read.mapq for read in readList] - topMapq = max(mapqs) - confidentReads = [read for read in readList if topMapq - read.mapq <= multimapDisputeResolutionQDelta] - return set(confidentReads) - - -def getConfidentSpecies(readList:typing.List[ReadAlignmentData], multimapDisputeResolutionQDelta:int=20): - if len(readList) == 1: - if readList[0].species: - return set([readList[0]]) - else: #Handles a case where one of the sets came back as unaligned - return set() - else: - mapqs = [read.mapq for read in readList] - topMapq = max(mapqs) - confidentSpecies = [read.species for read in readList if topMapq - read.mapq <= multimapDisputeResolutionQDelta] - return set(confidentSpecies) - - -def getMaxMapqFromReadList(readList:typing.List[ReadAlignmentData]): - return max([read.mapq for read in readList]) - - -def multimapDisputeResolution(readList:typing.List[ReadAlignmentData], minimumMapqForSingleReadConfidence:int=40, minimumMapqForPairedReadConfidence:int=30): - forwardReads, reverseReads = splitForwardAndReverse(readList, removeUnaligned=True) - if not reverseReads: - forwardMapqMax = getMaxMapqFromReadList(forwardReads) - if not forwardMapqMax >= minimumMapqForSingleReadConfidence: - return "Poor_quality" 
- confidentForwards = getConfidentReads(forwardReads) - if len(confidentForwards) == 1: - confidentForwards = list(confidentForwards) - return confidentForwards[0].species - else: - return "Ambiguous_reads" - if not forwardReads: - reverseMapqMax = getMaxMapqFromReadList(reverseReads) - if not reverseMapqMax >= minimumMapqForSingleReadConfidence: - return "Poor_quality" - confidentReverses = getConfidentReads(reverseReads) - if len(confidentReverses) == 1: - confidentReverses = list(confidentReverses) - return confidentReverses[0].species - else: - return "Ambiguous_reads" - forwardMapqMax = getMaxMapqFromReadList(forwardReads) - reverseMapqMax = getMaxMapqFromReadList(reverseReads) - if not forwardMapqMax >= minimumMapqForPairedReadConfidence or reverseMapqMax >= minimumMapqForPairedReadConfidence: - return "Poor_quality" - forwardConfidentSpecies = getConfidentSpecies(forwardReads) - reverseConfidentSpecies = getConfidentSpecies(reverseReads) - speciesIntersection = forwardConfidentSpecies.intersection(reverseConfidentSpecies) - if len(speciesIntersection) == 0: - return "Chimera_like" - elif len(speciesIntersection) == 1: - speciesIntersection = list(speciesIntersection) - return speciesIntersection[0] - else: - return "Ambiguous_reads" - - -def bamFileProcessor(bamFile:str): - print("Starting analysis of %s" %bamFile, flush=True) - readList = generateAnalyzedReadList(bamFile) - sortedReads = readSorter(readList) - del readList - readSets = sortedReadsDictToList(sortedReads) - del sortedReads - speciesCallCounts = getSpeciesCallCounts(readSets) - return speciesCallCounts \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/alignmentAnalysis/alignmentAnalysisSE.py b/miqScoreShotgunPublicSupport/alignmentAnalysis/alignmentAnalysisSE.py deleted file mode 100644 index f8682be..0000000 --- a/miqScoreShotgunPublicSupport/alignmentAnalysis/alignmentAnalysisSE.py +++ /dev/null @@ -1,173 +0,0 @@ -import os -import pysam -import typing - -''' -Read species calls: -Chimera_like: Both paired ends disagree, no good single-species explanation -Poor_quality: Depending on if call was being made on single or both ends, quality of read(s) for making call was insufficient -Unmapped_reads: No usable alignment data -Ambiguous_reads: Both reads support two or more species without a big enough delta in confidence to be decisive -If a species is given, that was a successful call -Under certain parameter settings, a tuple of multiple species may be given indicating reads that support multiple species -''' - - -class ReadAlignmentData(object): - __slots__ = ["qname", - "mapq", - "secondaryAlignment", - "isRead2", - "species"] - - def __init__(self, contig:str, mapq:int, qname:str, secondaryAlignment:bool, isRead2:bool): - self.species = extractSpecies(contig) - self.mapq = mapq - self.qname = qname - self.secondaryAlignment = secondaryAlignment - self.isRead2 = isRead2 - - def __str__(self): - secondary = "" - read = "R1" - if self.isRead2: - read = "R2" - if self.secondaryAlignment: - secondary = ", Secondary" - return "%s, %s, %s%s, %s " %(self.species, self.mapq, read, secondary, self.qname) - - - -class MispairedReadError(Exception): - pass - - -def extractSpecies(referenceName:str): - if referenceName is None: - return None - return "_".join(referenceName.split("_")[:2]) - - -def readParallelProcessor(read:pysam.AlignedRead): - return ReadAlignmentData(read.reference_name, read.mapping_quality, read.query_name, read.is_secondary, read.is_read2) - - -def listBamFiles(folder:str): - 
folderFiles = os.listdir(folder) - bamFiles = [os.path.join(folder, file) for file in folderFiles if file.endswith(".bam")] - return bamFiles - - -def generateAnalyzedReadList(bamFilePath:str): - import datetime - startTime = datetime.datetime.now() - bamFile = pysam.AlignmentFile(bamFilePath, "rb") - analyzedReads = [] - readCount = 0 - for read in bamFile: - analyzedReads.append(ReadAlignmentData(read.reference_name, read.mapping_quality, read.query_name, read.is_secondary, read.is_read2)) - readCount += 1 - if readCount % 500000 == 0: - analysisTime = datetime.datetime.now() - startTime - print("Analyzed %s reads in %s" %(readCount, analysisTime), flush=True) - bamFile.close() - analysisTime = datetime.datetime.now() - startTime - print("Analyzed %s reads in %s" % (readCount, analysisTime)) - return analyzedReads - - -def readSorter(readList:typing.List[ReadAlignmentData]): - sortedReads = {} - readList.insert(0, None) - read = readList.pop() - while read is not None: - if not read.qname in sortedReads: - sortedReads[read.qname] = [] - sortedReads[read.qname].append(read) - read = readList.pop() - return sortedReads - - -def sortedReadsDictToList(readDict:dict): - readList = [] - qnames = list(readDict.keys()) - qnames.insert(0, None) - qname = qnames.pop() - while qname: - readList.append(readDict[qname]) - del readDict[qname] - qname = qnames.pop() - return readList - - -def getSpeciesCallCounts(readSetList:typing.List[typing.List[ReadAlignmentData]]): - import collections - speciesCalls = [callSpeciesFromReadSet(readSet) for readSet in readSetList] - speciesCounts = collections.Counter(speciesCalls) - return speciesCounts - - -def callSpeciesFromReadSet(readList:typing.List[ReadAlignmentData], minimumMapqForSingleReadConfidence:int=40): - if len(readList) == 1: - if readList[0].species: - if readList[0].mapq >= minimumMapqForSingleReadConfidence: - return readList[0].species - else: - return "Poor_quality" - else: - return "Unaligned_reads" - else: - return multimapDisputeResolution(readList, minimumMapqForSingleReadConfidence) - - -def getConfidentReads(readList:typing.List[ReadAlignmentData], multimapDisputeResolutionQDelta:int=20): - if len(readList) == 1: - if readList[0].species: - return set([readList[0]]) - else: #Handles a case where one of the sets came back as unaligned - return set() - else: - mapqs = [read.mapq for read in readList] - topMapq = max(mapqs) - confidentReads = [read for read in readList if topMapq - read.mapq <= multimapDisputeResolutionQDelta] - return set(confidentReads) - - -def getConfidentSpecies(readList:typing.List[ReadAlignmentData], multimapDisputeResolutionQDelta:int=20): - if len(readList) == 1: - if readList[0].species: - return set([readList[0]]) - else: #Handles a case where one of the sets came back as unaligned - return set() - else: - mapqs = [read.mapq for read in readList] - topMapq = max(mapqs) - confidentSpecies = [read.species for read in readList if topMapq - read.mapq <= multimapDisputeResolutionQDelta] - return set(confidentSpecies) - - -def getMaxMapqFromReadList(readList:typing.List[ReadAlignmentData]): - return max([read.mapq for read in readList]) - - -def multimapDisputeResolution(readList:typing.List[ReadAlignmentData], minimumMapqForSingleReadConfidence:int=40): - mapqMax = getMaxMapqFromReadList(readList) - if not mapqMax >= minimumMapqForSingleReadConfidence: - return "Poor_quality" - confidentReads = getConfidentReads(readList) - if len(confidentReads) == 1: - confidentReads = list(confidentReads) - return 
confidentReads[0].species - else: - return "Ambiguous_reads" - - -def bamFileProcessor(bamFile:str): - print("Starting analysis of %s" %bamFile, flush=True) - readList = generateAnalyzedReadList(bamFile) - sortedReads = readSorter(readList) - del readList - readSets = sortedReadsDictToList(sortedReads) - del sortedReads - speciesCallCounts = getSpeciesCallCounts(readSets) - return speciesCallCounts \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/alignmentAnalysis/bwaHandler.py b/miqScoreShotgunPublicSupport/alignmentAnalysis/bwaHandler.py deleted file mode 100644 index cd6ba8d..0000000 --- a/miqScoreShotgunPublicSupport/alignmentAnalysis/bwaHandler.py +++ /dev/null @@ -1,86 +0,0 @@ -def bwaAlignPE(forwardReads:str, reverseReads:str, workingFolder:str, outputBAM:str, refGenome:str, coreLimit:int=None, compressionCoresPercentage:float=0.15, mock:bool=False): - import multiprocessing - import os - availableCores = multiprocessing.cpu_count() - 1 - if coreLimit and coreLimit > 0: - availableCores = max([availableCores, coreLimit]) - compressionCores = round(availableCores * compressionCoresPercentage) - compressionCores = max([compressionCores, 1]) - alignmentCores = availableCores - compressionCores - if alignmentCores < 1: - streaming = False - else: - streaming = True - if streaming: - bwaCommand = "bwa mem -t %s %s %s %s" %(alignmentCores, refGenome, forwardReads, reverseReads) - samtoolsCommand = "samtools view -b -@ %s -o %s" %(compressionCores, outputBAM) - combinedCommand = "%s | %s" %(bwaCommand, samtoolsCommand) - print("RUN: %s" %combinedCommand, flush=True) - if "MOCK" in os.environ: - mock = True - if not mock: - exitCode = os.system(combinedCommand) - else: - exitCode = 0 - print("MOCK RUN: %s" %combinedCommand) - print("Completed with status %s" %exitCode, flush=True) - if exitCode: - raise RuntimeError("Running alignment and compression returned a non-zero exit status") - else: - tempSAM = os.path.join(workingFolder, "temp.sam") - bwaCommand = "bwa mem %s %s %s > %s" % (refGenome, forwardReads, reverseReads, tempSAM) - samtoolsCommand = "samtools view -b -@ %s -o %s %s" % (compressionCores, outputBAM, tempSAM) - combinedCommand = "%s && %s && rm %s" % (bwaCommand, samtoolsCommand, tempSAM) - print("RUN: %s" % combinedCommand) - if not mock: - exitCode = os.system(combinedCommand) - else: - exitCode = 0 - print("MOCK RUN: %s" %combinedCommand) - print("Completed with status %s" % exitCode) - if exitCode: - raise RuntimeError("Running alignment and compression returned a non-zero exit status") - - -def bwaAlignSE(forwardReads:str, workingFolder:str, outputBAM:str, refGenome:str, coreLimit:int=None, compressionCoresPercentage:float=0.15, mock:bool=False): - import multiprocessing - import os - availableCores = multiprocessing.cpu_count() - 1 - if coreLimit and coreLimit > 0: - availableCores = max([availableCores, coreLimit]) - compressionCores = round(availableCores * compressionCoresPercentage) - compressionCores = max([compressionCores, 1]) - alignmentCores = availableCores - compressionCores - if alignmentCores < 1: - streaming = False - else: - streaming = True - if streaming: - bwaCommand = "bwa mem -t %s %s %s" %(alignmentCores, refGenome, forwardReads) - samtoolsCommand = "samtools view -b -@ %s -o %s" %(compressionCores, outputBAM) - combinedCommand = "%s | %s" %(bwaCommand, samtoolsCommand) - print("RUN: %s" %combinedCommand, flush=True) - if "MOCK" in os.environ: - mock = True - if not mock: - exitCode = os.system(combinedCommand) - else: - 
exitCode = 0 - print("MOCK RUN: %s" %combinedCommand) - print("Completed with status %s" %exitCode, flush=True) - if exitCode: - raise RuntimeError("Running alignment and compression returned a non-zero exit status") - else: - tempSAM = os.path.join(workingFolder, "temp.sam") - bwaCommand = "bwa mem %s %s > %s" % (refGenome, forwardReads, tempSAM) - samtoolsCommand = "samtools view -b -@ %s -o %s %s" % (compressionCores, outputBAM, tempSAM) - combinedCommand = "%s && %s && rm %s" % (bwaCommand, samtoolsCommand, tempSAM) - print("RUN: %s" % combinedCommand) - if not mock: - exitCode = os.system(combinedCommand) - else: - exitCode = 0 - print("MOCK RUN: %s" %combinedCommand) - print("Completed with status %s" % exitCode) - if exitCode: - raise RuntimeError("Running alignment and compression returned a non-zero exit status") \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/alignmentAnalysis/minimap2.py b/miqScoreShotgunPublicSupport/alignmentAnalysis/minimap2.py deleted file mode 100644 index 63fd8a3..0000000 --- a/miqScoreShotgunPublicSupport/alignmentAnalysis/minimap2.py +++ /dev/null @@ -1,42 +0,0 @@ -def minimapAlign(forwardReads:str, workingFolder:str, outputBAM:str, refGenome:str, coreLimit:int=None, compressionCoresPercentage:float=0.15, mock:bool=False): - import multiprocessing - import os - availableCores = multiprocessing.cpu_count() - 1 - if coreLimit and coreLimit > 0: - availableCores = max([availableCores, coreLimit]) - compressionCores = round(availableCores * compressionCoresPercentage) - compressionCores = max([compressionCores, 1]) - alignmentCores = availableCores - compressionCores - if alignmentCores < 1: - streaming = False - else: - streaming = True - if streaming: - minimapCommand = "minimap2 -L -ax map-ont -t %s %s %s" %(alignmentCores, refGenome, forwardReads) #the -L command clips CIGARs that are too long for BAM standards - samtoolsCommand = "samtools view -b -@ %s -o %s" %(compressionCores, outputBAM) - combinedCommand = "%s | %s" %(minimapCommand, samtoolsCommand) - print("RUN: %s" %combinedCommand, flush=True) - if "MOCK" in os.environ: - mock = True - if not mock: - exitCode = os.system(combinedCommand) - else: - exitCode = 0 - print("MOCK RUN: %s" %combinedCommand) - print("Completed with status %s" %exitCode, flush=True) - if exitCode: - raise RuntimeError("Running alignment and compression returned a non-zero exit status") - else: - tempSAM = os.path.join(workingFolder, "temp.sam") - minimapCommand = "minimap2 -L -ax map-ont %s %s > %s" % (refGenome, forwardReads, tempSAM) - samtoolsCommand = "samtools view -b -@ %s -o %s %s" % (compressionCores, outputBAM, tempSAM) - combinedCommand = "%s && %s && rm %s" % (minimapCommand, samtoolsCommand, tempSAM) - print("RUN: %s" % combinedCommand) - if not mock: - exitCode = os.system(combinedCommand) - else: - exitCode = 0 - print("MOCK RUN: %s" %combinedCommand) - print("Completed with status %s" % exitCode) - if exitCode: - raise RuntimeError("Running alignment and compression returned a non-zero exit status") \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/formatReaders/__init__.py b/miqScoreShotgunPublicSupport/formatReaders/__init__.py deleted file mode 100644 index 0d9712b..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from . import fastq -from . import qualityScore -from . 
import gzipIdentifier - -__all__ = ["fastq", - "qualityScore", - "gzipIdentifier"] \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/formatReaders/fastq/__init__.py b/miqScoreShotgunPublicSupport/formatReaders/fastq/__init__.py deleted file mode 100644 index 21f4fa4..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/fastq/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from . import fastqHandler -from . import fastqAnalysis -from . import fileNamingStandards - -__all__ = ["fastqHandler", - "fastqAnalysis", - "fileNamingStandards"] \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/formatReaders/fastq/fastqAnalysis.py b/miqScoreShotgunPublicSupport/formatReaders/fastq/fastqAnalysis.py deleted file mode 100644 index dcb217f..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/fastq/fastqAnalysis.py +++ /dev/null @@ -1,314 +0,0 @@ -import logging -logger = logging.getLogger(__name__) -from . import fileNamingStandards - -def buildQualityMatrix(path:str): - import numpy - from .fastqHandler import FastqFile - fastq = FastqFile(path, depth=1) - qualityMatrix = [] - for read in fastq: - qualityMatrix.append(read.quality.phredScores) - fastq.close() - return numpy.matrix(qualityMatrix, dtype='uint8') #Memory efficient, but if someone feeds in a phred score > 255, this will break. PacBio, I'm looking at you. - - -def buildQualityMatrixPaired(forward:str, reverse:str): - return buildQualityMatrix(forward), buildQualityMatrix(reverse) - - -def buildExpectedErrorMatrix(path:str, superLean:bool = False, startPosition:int = 0, subsample:int=0): - import numpy - from .. import qualityScore - from .fastqHandler import FastqFile - fastq = FastqFile(path, depth = 0, subsample = subsample) - expectedErrorMatrix = [] - dataType = 'float16' - if superLean: - dataType = 'uint8' - for line in fastq: - expectedErrorLineList = qualityScore.qualityScoreHandler.cumulativeExpectedErrorArray(line.quality, fastq.qualityScoreScheme)[startPosition:] - expectedErrorMatrix.append(expectedErrorLineList) #low precision floating point. Usually users are looking for whole numbers anyway - fastq.close() - return numpy.array(expectedErrorMatrix, dataType, order='F') - - -def buildExpectedErrorMatrixPaired(forward:str, reverse:str, superLean:bool = False, startPositions:tuple = (0, 0), subsample:int=0): - return buildExpectedErrorMatrix(forward, superLean, startPositions[0]), buildExpectedErrorMatrix(reverse, superLean, startPositions[1]) - - -def findCutoffByPercentile(path:str, phredScore:int, percentile:int): - ''' - This will analyze a fastq file to find where the given percentile of reads is at or below the given phred score (such as finding the read where the 10th percentile of reads is phred=10. - Value returned is the position *INDEXED TO ZERO* - :param path: path of the Fastq to analyze - :param phredScore: score to use in cutoff - :param percentile: percentile to use in cutoff - :return:base position (integer) - ''' - import numpy - qualityMatrix = buildQualityMatrix(path).transpose() #faster calclation of percentiles if we have positions as rows and reads as columns - for position, row in enumerate(qualityMatrix): - nthPercentile = numpy.percentile(row, percentile) - if nthPercentile < percentile: - return position - return numpy.size(qualityMatrix, 0) - - -def makeQualityMatrix(path:str): - import numpy - from . 
import fastqHandler - readLength, variance = fastqHandler.estimateReadLength(path, getVariance=True) - if variance != 0: - readLength = fastqHandler.getLongestReadInFile(path) - fastq = fastqHandler.FastqFile(path, depth=1) - qualityRange = fastq.qualityScoreScheme.range - readLengthMatrix = [0] * readLength - qualityCountMatrix = [] - for i in range(qualityRange + 1): - qualityCountMatrix.append(readLengthMatrix.copy()) - ''' - Building a matrix here where the correspond to all possibly quality scores and columns represent each base position of each read (indexed to zero) - Calling a specific value is done by qualityMatrix[qualityScore][readPosition] - ''' - for read in fastq: - for position, phred in enumerate(read.quality.phredScores): - qualityCountMatrix[phred][position] = qualityCountMatrix[phred][position] + 1 - fastq.close() - qualityCountMatrix = numpy.matrix(qualityCountMatrix) - return qualityCountMatrix - # plt.imshow(qualityCountMatrix, origin='lower', aspect='auto') - # plt.xlabel("Position") - # plt.ylabel("Quality (Phred)") - # plt.title("Read quality for %s" %path) - # if not testingOnly: - # if outputFile: - # plt.savefig(outputFile) - # else: - # plt.show() - # return qualityCountMatrix - - -def makeAverageExpectedErrorLine(path:str): - import numpy - expectedErrorMatrix = buildExpectedErrorMatrix(path) - expectedErrorMatrix = expectedErrorMatrix.transpose() - means = [] - for line in expectedErrorMatrix: - means.append(numpy.mean(line)) - return means - # plt.plot(means, 'k-') - # plt.xlabel("Position") - # plt.ylabel("Average Expected Error") - # plt.show() - - -def getDataForFastqPlots(forwardFastq:fileNamingStandards.NamingStandard, reverseFastq:fileNamingStandards.NamingStandard = None): - forwardQualityMatrix = makeQualityMatrix(forwardFastq.filePath) - forwardExpectedErrorLine = makeAverageExpectedErrorLine(forwardFastq.filePath) - if reverseFastq is None: - reverseQualityMatrix = None - reverseExpectedErrorLine = None - else: - reverseQualityMatrix = makeQualityMatrix(reverseFastq.filePath) - reverseExpectedErrorLine = makeAverageExpectedErrorLine(reverseFastq.filePath) - return forwardQualityMatrix, reverseQualityMatrix, forwardExpectedErrorLine, reverseExpectedErrorLine - - -def generateFastqPlotPaired(forwardFastq:fileNamingStandards.NamingStandard, reverseFastq:fileNamingStandards.NamingStandard, sampleTitle:str = None, outputFile:str = None, base64Format:str = None): - import matplotlib.pyplot as plt - if base64Format: - import base64 - if outputFile and base64Format: - outputFileFormat = outputFile.split(".")[-1] - if not outputFileFormat == base64Format: - logger.error( - "Cannot save plot in one format and return base64 in a different format. Returning file save format. Save in %s. 
Return base64 %s" % ( - outputFileFormat, base64Format)) - if sampleTitle is None: - sampleTitle = " ".join([str(item) for item in forwardFastq.sampleID]) - else: - sampleTitle = str(sampleTitle) - forwardQualityMatrix, reverseQualityMatrix, forwardExpectedErrorLine, reverseExpectedErrorLine = getDataForFastqPlots(forwardFastq, reverseFastq) - plt.suptitle("Analysis of %s" % sampleTitle, horizontalalignment="center", fontsize=18, fontweight="bold") - - #make plots for forward reads - plt.subplot(221) - plt.imshow(forwardQualityMatrix, origin='lower', aspect='auto') - plt.xlabel("Read 1 Position") - plt.ylabel("Quality (Phred)") - plt.title(" ", fontsize = 16) #making a whitespace buffer - plt.subplot(222) - plt.plot(forwardExpectedErrorLine, 'k-') - plt.xlabel("Read 1 Position") - plt.ylabel("Average Expected Error") - plt.title(" ", fontsize = 16) #making a whitespace buffer - - #make plots for reverse reads - plt.subplot(223) - plt.imshow(reverseQualityMatrix, origin='lower', aspect='auto') - plt.xlabel("Read 2 Position") - plt.ylabel("Quality (Phred)") - #plt.title("Read quality for %s" %reverseFastq.fileName) - plt.subplot(224) - plt.plot(reverseExpectedErrorLine, 'k-') - plt.xlabel("Read 2 Position") - plt.ylabel("Average Expected Error") - #plt.title("Expected error for %s" % reverseFastq.fileName) - - plt.tight_layout() - if outputFile: - plt.savefig(outputFile) - if base64Format: - imageFile = open(outputFile) - encodedFile = base64.b64encode(imageFile.read()) - imageFile.close() - return encodedFile - elif base64Format: - import io - byteStream = io.BytesIO() - plt.savefig(byteStream, format=base64Format) - byteStream.seek(0) - encodedFile = base64.b64encode(byteStream.read()) - return encodedFile - else: - plt.show() - - -def generateFastqPlotSingle(forwardFastq: fileNamingStandards.NamingStandard, sampleTitle: str = None, outputFile: str = None, base64Format:str = None): - import matplotlib.pyplot as plt - if base64Format: - import base64 - if outputFile and base64Format: - outputFileFormat = outputFile.split(".")[-1] - if not outputFileFormat == base64Format: - logger.error("Cannot save plot in one format and return base64 in a different format. Returning file save format. Save in %s. 
Return base64 %s" %(outputFileFormat, base64Format)) - if sampleTitle is None: - sampleTitle = " ".join([str(item) for item in forwardFastq.sampleID]) - else: - sampleTitle = str(sampleTitle) - forwardQualityMatrix, reverseQualityMatrix, forwardExpectedErrorLine, reverseExpectedErrorLine = getDataForFastqPlots(forwardFastq) - plt.suptitle(sampleTitle, horizontalalignment="center", fontsize = 18, fontweight = "bold") - - # make plots for reads - plt.subplot(211) - plt.imshow(forwardQualityMatrix, origin='lower', aspect='auto') - plt.xlabel("Position") - plt.ylabel("Quality (Phred)") - plt.title(" ", fontsize = 16) #making a whitespace buffer - plt.subplot(212) - plt.plot(forwardExpectedErrorLine, 'k-') - plt.xlabel("Position") - plt.ylabel("Average Expected Error") - - plt.tight_layout() - if outputFile: - plt.savefig(outputFile) - if base64Format: - imageFile = open(outputFile) - encodedFile = base64.b64encode(imageFile.read()) - imageFile.close() - return encodedFile - elif base64Format: - import io - byteStream = io.BytesIO() - plt.savefig(byteStream, format=base64Format) - byteStream.seek(0) - encodedFile = base64.b64encode(byteStream.read()) - return encodedFile - else: - plt.show() - - -class ParallelPlotAgent(object): - - def __init__(self, outputDirectory:str = None, base64Output:bool = False, outputFormat:str = None): - self.outputDirectory = outputDirectory - self.outputFormat = outputFormat - self.base64Output = base64Output - if outputDirectory or base64Output: - if not outputFormat: - raise ValueError("If output to file (directory) or base64 is set, an output format must be provided, but none was.") - - def parallelPlotter(self, fastq:[tuple, fileNamingStandards.NamingStandard]): - import os - if type(fastq) == tuple: - sampleName = "_".join([str(item) for item in fastq[0].sampleID]) - returnFastq = fastq[0] - else: - sampleName = "_".join([str(item) for item in fastq.sampleID]) - returnFastq = fastq - if self.outputDirectory: - outputFileName = os.path.join(self.outputDirectory, sampleName + ".%s" %self.outputFormat) - else: - outputFileName = None - if self.base64Output: - base64Format = self.outputFormat - else: - base64Format = None - if type(fastq) == tuple: - base64EncodedPlot = generateFastqPlotPaired(fastq[0], fastq[1], outputFile=outputFileName, base64Format=base64Format) - else: - base64EncodedPlot = generateFastqPlotSingle(fastq, outputFile=outputFileName, base64Format=base64Format) - return returnFastq, outputFileName, base64EncodedPlot #returnValue will be None unless a base64 encoded image was returned - - -def plotFastqFilesInFolder(directory:str, namingStandard:fileNamingStandards.NamingStandard, outputDirectory:str = None, base64Output:bool = False, outputFormat:str = None): - import os - from . import fastqHandler - from ... 
import easyMultiprocessing - if outputDirectory and not os.path.isdir(directory): - raise NotADirectoryError("Unable to find a directory at %s" %directory) - if outputDirectory and not os.path.isdir(outputDirectory): - raise NotADirectoryError("Unable to find a directory at %s" %outputDirectory) - if outputDirectory or base64Output: - if not outputFormat: - raise ValueError( - "If output to file (directory) or base64 is set, an output format must be provided, but none was.") - fastqTable = fastqHandler.getSamplePairTableFromFolder(directory, namingStandard) - fastqSetList = [] - for key in fastqTable: - if key == "unpaired": - for fastq in fastqTable["unpaired"]: - fastqSetList.append(fastq) - else: - fastqSetList.append(fastqTable[key]) - parallelPlotAgent = ParallelPlotAgent(outputDirectory=outputDirectory, base64Output=base64Output, outputFormat=outputFormat) - if outputDirectory or base64Output: - plotReturnValues = easyMultiprocessing.parallelProcessRunner(parallelPlotAgent.parallelPlotter, fastqSetList) - else: - plotReturnValues = [parallelPlotAgent.parallelPlotter(fastq) for fastq in fastqSetList] #can't do parallel plotting if plotting to a display window - returnTable = {} - if outputDirectory and base64Output: - for fastq, outputFile, base64EncodedPlot in plotReturnValues: - returnTable[fastq] = (outputFile, base64EncodedPlot) - elif base64Output: - for fastq, outputFile, base64EncodedPlot in plotReturnValues: - returnTable[fastq] = base64EncodedPlot - elif outputDirectory: - for fastq, outputFile, base64EncodedPlot in plotReturnValues: - returnTable[fastq] = base64EncodedPlot - return returnTable - - -def getEstimatedFastqFileSizeSumFromList(fastqList:list): - import os - from .. import gzipIdentifier - sum = 0 - for fastq in fastqList: - fileSize = os.path.getsize(fastq.filePath) - if gzipIdentifier.isGzipped(fastq.filePath): - fileSize = round(fileSize * 3.5) - sum += fileSize - return sum - -def getEstimatedFastqSizeSumFromDirectory(path:str): - import os - from . import fastqHandler - if not os.path.isdir(path): - raise NotADirectoryError("Unable to find a directory at %s" %path) - fastqList = fastqHandler.findSamplesInFolder(path) - return getEstimatedFastqFileSizeSumFromList(fastqList) - - - diff --git a/miqScoreShotgunPublicSupport/formatReaders/fastq/fastqHandler.py b/miqScoreShotgunPublicSupport/formatReaders/fastq/fastqHandler.py deleted file mode 100644 index af75c9c..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/fastq/fastqHandler.py +++ /dev/null @@ -1,553 +0,0 @@ -import os -import logging -import typing -logger = logging.getLogger(__name__) -from .. import qualityScore -from . import fileNamingStandards - -class ReadMetadataLine(object): - - def __init__(self, rawMetadata): - self.rawMetadata = rawMetadata - if not rawMetadata.startswith("@"): - logger.warning("Got a metadata line that did not start with an @ symobol. This goes against the fastq standard and may suggest a corrupt file. Line: %s" %rawMetadata) - metadataSplit = rawMetadata.strip().split(" ") - if not len(metadataSplit) == 2: - errorMessage = "Got a metadata line that appears to have more than two elements divided by space. 
%s" %rawMetadata - logger.critical(errorMessage) - raise FastqFormatError(errorMessage) - equipmentInfo, readInfo = metadataSplit - self.validEquipmentInfo = self.processEquipmentInfo(equipmentInfo, rawMetadata) - self.validReadInfo = self.processReadInfo(readInfo, rawMetadata) - self.allValidInfo = self.validEquipmentInfo and self.validReadInfo - - def processReadInfo(self, readInfo:str, rawMetadata:str=""): - validFields = True - readInfo = readInfo.split(":") - if not len(readInfo) == 4: - errorMessage = "Got a read info section of metadata that did not have 4 elements. Line: %s" %rawMetadata - logger.critical(errorMessage) - raise FastqFormatError(errorMessage) - self.direction, self.filtered, self.controlBits, self.index = readInfo - try: - self.direction = int(self.direction) - if self.direction not in [1, 2]: - validFields = False - logger.error("Read direction found that was not 1 or 2. Line: %s" %rawMetadata) - except ValueError: - validFields = False - logger.error("Read direction could not be cast to integer. Line: %s" %rawMetadata) - if self.filtered.upper() == "Y": - self.filtered = True - self.passedFilter = False - elif self.filtered.upper() == "N": - self.filtered = False - self.passedFilter = True - else: - self.passedFilter = None - validFields = False - logger.error("Got a value for filtered that was not Y or N. Line: %s" %rawMetadata) - try: - self.controlBits = int(self.controlBits) - if not self.controlBits % 2 == 0: - validFields = False - logger.error("Got a control bits value of %s. Control bits should be an even number. Line: %s " %(self.controlBits, rawMetadata)) - except ValueError: - validFields = False - logger.error("Unable to cast control bits to an integer. Line: %s " %rawMetadata) - return validFields - - def processEquipmentInfo(self, equipmentInfo:str, rawMetadata:str=""): - validFields = True - equipmentInfo = equipmentInfo.replace("@", "") - equipmentInfo = equipmentInfo.split(":") - if not len(equipmentInfo) == 7: - logger.critical("Equipment info section of metadata did not have 7 elements. Line: %s" %rawMetadata) - raise FastqFormatError("Equipment info section of metadata did not have 7 elements. Line: %s" %rawMetadata) - self.instrumentName, self.runID, self.flowcellID, self.tileNumber, self.laneNumber, self.xCoordinate, self.yCoordinate = equipmentInfo - try: - self.runID = int(self.runID) - except ValueError: - validFields = False - logger.error("Run ID number could not be cast to integer. Metadata line: %s" %rawMetadata) - try: - self.laneNumber = int(self.laneNumber) - except ValueError: - validFields = False - logger.error("Lane number could not be cast to integer. Metadata line: %s" %rawMetadata) - try: - self.tileNumber = int(self.tileNumber) - except ValueError: - validFields = False - logger.error("Tile number could not be cast to integer. Metadata line: %s" %rawMetadata) - try: - self.xCoordinate = int(self.xCoordinate) - except ValueError: - validFields = False - logger.error("X-coordinate could not be cast to integer. Metadata line: %s" %rawMetadata) - try: - self.yCoordinate = int(self.yCoordinate) - except ValueError: - validFields = False - logger.error("Y-coordinate could not be cast to integer. 
Metadata line: %s" %rawMetadata) - return validFields - - def __str__(self): - return self.rawMetadata - - -class QualityScoreLine(object): - - def __init__(self, rawQualityLine:str, base:int = 33): - self.qualityString = rawQualityLine - self.phredScores = self.calculatePhredScores(base) - - def calculatePhredScores(self, base:int = 33): - from .. import qualityScore - return qualityScore.qualityScoreHandler.convertToNumericArray(self.qualityString, base) - - def __str__(self): - return self.qualityString - - def __getitem__(self, item): - return self.phredScores[item] - - def __iter__(self): - for value in self.phredScores: - yield value - - -class SequenceLine(object): - - def __init__(self, rawSequence, runAnalysis:bool=False): - self.sequence = rawSequence.strip().upper().replace(".", "N") - self.length = len(self.sequence) - if runAnalysis: - self.baseFrequency = self.getBaseFrequencyTable() - self.gcContent = self.calculateGCContent() - - def getBaseFrequencyTable(self): - freq = {"A" : 0, - "G" : 0, - "C" : 0, - "T" : 0, - "N" : 0} - for base in self.sequence: - try: - freq[base] += 1 - except KeyError: - logger.error("Found a sequence with an invalid character. Character: %s Sequence: %s" %(base, self.sequence)) - return freq - - def calculateGCContent(self): - totalReadBases = 0 - gcBases = 0 - for base in "ATGC": - totalReadBases += self.baseFrequency[base] - if base in "GC": - gcBases += self.baseFrequency[base] - if totalReadBases == 0: - return 0 - return gcBases/totalReadBases - - def __len__(self): - return self.length - - def __str__(self): - return self.sequence - - def __eq__(self, other): - if type(other) == SequenceLine: - return self.sequence == other.sequence - elif type(other) == str: - return self.sequence == SequenceLine(other).sequence - else: - logger.critical("Attempted to compare a sequence to something that is not a sequence line type or string. 
Value in question was type %s: %s" %(type(other), other)) - - -class FastqLineSet(object): - - def __init__(self, metadata:str, sequence:str, spacer:str, quality:str, depth:int=0, analyzeMetadata:bool=False, analyzeSequence:bool=False, analyzeSequenceInDepth:bool=False, analyzeQuality:bool=False, qualityBase:int=33): - self.metadata = metadata.strip() - self.sequence = sequence.strip() - self.spacer = spacer.strip() - self.quality = quality.strip() - if depth >= 1 or analyzeQuality: - self.quality = QualityScoreLine(quality, qualityBase) - if depth >= 2 or analyzeSequence or analyzeSequenceInDepth: - if depth >= 4 or analyzeSequenceInDepth: - self.sequence = SequenceLine(self.sequence, runAnalysis=True) - else: - self.sequence = SequenceLine(self.sequence) - if depth >= 3 or analyzeMetadata: - self.metadata = ReadMetadataLine(self.metadata) - - def __str__(self): - return "%s\n%s\n%s\n%s" %(self.metadata, self.sequence, self.spacer, self.quality) - -def reanalyzeFastqLineSet(fastqLineSet:FastqLineSet, depth:int=0, analyzeMetadata:bool=False, analyzeSequence:bool=False, analyzeSequenceInDepth:bool=False, analyzeQuality:bool=False, qualityBase:int=33): - return FastqLineSet(str(fastqLineSet.metadata), - str(fastqLineSet.sequence), - str(fastqLineSet.spacer), - str(fastqLineSet.quality), - depth, analyzeMetadata, analyzeSequence, analyzeSequenceInDepth, analyzeQuality, qualityBase) - -class FastqFile(object): - - def __init__(self, path:str, depth:int=0, analyzeMetadata:bool=False, analyzeSequence:bool=False, analyzeSequenceInDepth:bool=False, analyzeQuality:bool=False, fullValidation:bool=False, qualityScoreScheme:[qualityScore.qualityScoreHandler.EncodingScheme, None]=None, subsample:int = 0): - self.path = path - if not os.path.isfile(path): - logger.critical("Unable to find fastq file at %s" %path) - raise FileNotFoundError("Unable to find fastq file at %s" %path) - follower = qualityScoreScheme - if not qualityScoreScheme: - qualityScoreScheme = findQualityScoreEncoding(path) - if type(qualityScoreScheme) == qualityScore.qualityScoreHandler.EncodingScheme: - self.qualityScoreScheme = qualityScoreScheme - else: - raise TypeError("Quality score scheme must be of qualityScoreHandler.EncodingScheme type. Passed: %s of type %s." %(qualityScoreScheme, type(qualityScoreScheme))) - self.depth = depth - self.analyzeMetadata = analyzeMetadata - self.analyzeSequence = analyzeSequence - self.analyzeSequenceInDepth = analyzeSequenceInDepth - self.analyzeQuality = analyzeQuality - self.fullValidation = fullValidation - self.reachedEnd = False - self.gzipped = self.checkGzip(path) - if self.gzipped: - import gzip - self.filehandle = gzip.open(path, "rt") - else: - self.filehandle = open(path, "r") - self.open = True - subsample = int(subsample) - if subsample == 0: - subsample = 1 - self.subsample = subsample - self.currentLine = 0 - - def checkGzip(self, path): - from .. import gzipIdentifier - return gzipIdentifier.isGzipped(path) - - def getNextRead(self): - - def read4Lines(): - readBuffer = [] - for i in range(4): - nextLine = self.filehandle.readline() - if not nextLine: - self.reachedEnd = True - break - nextLine = nextLine.strip() - if nextLine: - readBuffer.append(nextLine) - if self.reachedEnd: - if readBuffer: - logger.error( - "Fastq file at %s appears to me missing lines (found something not a multiple of 4." 
% self.path) - for i in range(4 - len(readBuffer)): - readBuffer.append("") - return readBuffer - - if not self.open: - logger.critical("Attempting to read from a closed fastq file at %s" %self.path) - raise ValueError("I/O operation on a closed file") - readBuffer = None - includedLine = False - while not includedLine: - readBuffer = read4Lines() - self.currentLine += 1 - includedLine = (self.currentLine - 1) % self.subsample == 0 or self.reachedEnd - if not readBuffer: - return readBuffer - else: - fastqLineSet = FastqLineSet(*readBuffer, depth=self.depth, analyzeMetadata=self.analyzeMetadata, analyzeSequence=self.analyzeSequence, analyzeSequenceInDepth=self.analyzeSequenceInDepth, analyzeQuality=self.analyzeQuality, qualityBase=self.qualityScoreScheme.base) - if self.fullValidation: - if not len(readBuffer[1]) == len(readBuffer[3]): - raise FastqValidationError("Got mismatched sequence and quality line lengths for line %s" %readBuffer) - if type(fastqLineSet.metadata) == str: - metadata = ReadMetadataLine(str(fastqLineSet.metadata)) - else: - metadata = fastqLineSet.metadata - if not metadata.allValidInfo: - raise FastqValidationError("Got some invalid metadata for line %s" %readBuffer) - return fastqLineSet - - def close(self): - if not self.filehandle.closed: - self.filehandle.close() - - def __iter__(self): - return self - - def __next__(self): - returnValue = self.getNextRead() - if self.reachedEnd: - self.close() - raise StopIteration - else: - return returnValue - - def __str__(self): - return "Fastq file object at %s" %self.path - - -class FastqFilePair(object): - - def __init__(self, pe1Path:str, pe2Path:str, depth:int=0, analyzeMetadata:bool=False, analyzeSequence:bool=False, analyzeSequenceInDepth:bool=False, analyzeQuality:bool=False, fullValidation:bool=False, qualityScoreScheme:qualityScore.qualityScoreHandler=None, subsample:int=0): - self.pe1Path = pe1Path - if not os.path.isfile(pe1Path): - logger.critical("Unable to find fastq file at %s" %pe1Path) - raise FileNotFoundError("Unable to find paired-end 1 fastq file at %s" %pe1Path) - self.pe2Path = pe2Path - if not os.path.isfile(pe2Path): - logger.critical("Unable to find fastq file at %s" %pe2Path) - raise FileNotFoundError("Unable to find paired-end 1 fastq file at %s" %pe2Path) - self.depth = depth - self.analyzeMetadata = analyzeMetadata - self.analyzeSequence = analyzeSequence - self.analyzeSequenceInDepth = analyzeSequenceInDepth - self.analyzeQuality = analyzeQuality - self.fullValidation = fullValidation - self.reachedEnd = False - if subsample == 0: - subsample = 1 - self.subsample = subsample - self.pe1FileHandle = FastqFile(pe1Path, depth=depth, analyzeMetadata=analyzeMetadata, analyzeSequence=analyzeSequence, analyzeSequenceInDepth=analyzeSequenceInDepth, analyzeQuality=analyzeQuality, fullValidation=fullValidation, qualityScoreScheme=qualityScoreScheme, subsample=subsample) - self.pe2FileHandle = FastqFile(pe2Path, depth=depth, analyzeMetadata=analyzeMetadata, analyzeSequence=analyzeSequence, analyzeSequenceInDepth=analyzeSequenceInDepth, analyzeQuality=analyzeQuality, fullValidation=fullValidation, qualityScoreScheme=qualityScoreScheme, subsample=subsample) - if not self.pe1FileHandle.qualityScoreScheme == self.pe2FileHandle.qualityScoreScheme: - logger.warning("Paired end files appear to have different quality score encodings. Pe1: %s:%s. 
Pe2: %s%s" %(self.pe1FileHandle.qualityScoreScheme, self.pe1FileHandle.path, self.pe2FileHandle.qualityScoreScheme, self.pe2FileHandle.path)) - self.open = True - self.reportedReadMismatch = False - - def getNextReadPair(self): - if not self.open: - logger.critical("Attempting to read from a closed fastq files at %s and %s" %(self.pe1Path, self.pe2Path)) - raise ValueError("I/O operation on a closed file") - nextPe1 = self.pe1FileHandle.getNextRead() - nextPe2 = self.pe2FileHandle.getNextRead() - if (nextPe1 and not nextPe2) or (not nextPe1 and nextPe2): - if nextPe1: - logger.error("Ran out of paired-end 2 reads with remaining paired-end 1 reads for file pair %s and %s" %(self.pe1Path, self.pe2Path)) - else: - logger.error("Ran out of paired-end 1 reads with remaining paired-end 2 reads for file pair %s and %s" %(self.pe1Path, self.pe2Path)) - if self.fullValidation: - raise FastqValidationError("Reached end of one paired-end file before the other. Files: %s and %s" %(self.pe1Path, self.pe2Path)) - if not nextPe1 and not nextPe2: - self.reachedEnd = True - return None - if nextPe1 and nextPe2 and self.fullValidation: - self.runValidation(nextPe1, nextPe2) - return nextPe1, nextPe2 - - def runValidation(self, pe1:FastqLineSet, pe2:FastqLineSet): - if type(pe1.metadata) == str: - pe1Metadata = ReadMetadataLine(str(pe1.metadata)) - elif type(pe1.metadata) == ReadMetadataLine: - pe1Metadata = pe1.metadata - else: - raise TypeError("Only able to compare metadata as string or metadata objects") - if type(pe2.metadata) == str: - pe2Metadata = ReadMetadataLine(str(pe2.metadata)) - elif type(pe1.metadata) == ReadMetadataLine: - pe2Metadata = pe2.metadata - else: - raise TypeError("Only able to compare metadata as string or metadata objects") - if not pe1Metadata.allValidInfo or not pe2Metadata.allValidInfo: - raise FastqValidationError("Got invalid metadata field for at least one read in paired end mates:\n%s\n%s" %(pe1, pe2)) - if not validPairedEndMetadata(pe1Metadata, pe2Metadata): - raise FastqValidationError("Got invalid metadata match for paired end mates:\n%s\n%s" %(pe1, pe2)) - - def close(self): - self.pe1FileHandle.close() - self.pe2FileHandle.close() - self.open = False - - def __iter__(self): - return self - - def __next__(self): - returnValue = self.getNextReadPair() - if self.reachedEnd: - raise StopIteration - else: - return returnValue - - def __str__(self): - return "Fastq file pair object at %s and %s" %(self.pe1Path, self.pe2Path) - - -class FastqValidationError(Exception): - pass - - -class FastqFormatError(Exception): - pass - - -def validPairedEndMetadata(pe1:ReadMetadataLine, pe2:ReadMetadataLine): - matchFields = ["instrumentName", - "runID", - "flowcellID", - "laneNumber", - "tileNumber", - "xCoordinate", - "yCoordinate", - "index"] - for field in matchFields: - pe1Value = getattr(pe1, field) - pe2Value = getattr(pe2, field) - if not pe1Value == pe2Value: - logger.error("Mismatch on %s" %matchFields) - return False - if not ((pe1.direction == 1 and pe2.direction == 2) or (pe2.direction == 1 and pe1.direction == 2)): - return False - return True - - -def validFastqFile(path:str): - readCount = 0 - fastq = FastqFile(path, fullValidation=True) - read = fastq.getNextRead() - while read: - try: - read = fastq.getNextRead() - readCount += 1 - except Exception as error: - logger.error(error) - return False - fastq.close() - return readCount - - -def validFastqPair(pe1Path:str, pe2Path:str): - readCount = 0 - fastqPair = FastqFilePair(pe1Path, pe2Path, fullValidation=True) - read 
= fastqPair.getNextReadPair() - while read: - try: - read = fastqPair.getNextReadPair() - readCount += 1 - except Exception as error: - logger.error(error) - fastqPair.close() - return False - fastqPair.close() - return readCount - - -def estimateReadLength(path:str, samplesize:int=100, getVariance = False): - lengths = [] - fastq = FastqFile(path) - read = fastq.getNextRead() - while read: - lengths.append(len(read.sequence)) - if len(lengths) >= samplesize: - break - read = fastq.getNextRead() - meanReadLength = sum(lengths)/len(lengths) - if getVariance: - import statistics - lengthVariance = statistics.variance(lengths) - return round(meanReadLength), lengthVariance - return round(meanReadLength) - - -def getLongestReadInFile(path:str): - longestReadLength = 0 - fastq = FastqFile(path) - for read in fastq: - if len(read.sequence) > longestReadLength: - longestReadLength = len(read.sequence) - fastq.close() - return longestReadLength - - -def countReads(path:str): - readCount = 0 - fastq = FastqFile(path) - read = fastq.getNextRead() - while read: - readCount += 1 - read = fastq.getNextRead() - fastq.close() - return readCount - - -def findQualityScoreEncoding(path:str, lineLimit:int=100): - from .. import qualityScore - candidates = qualityScore.qualityScoreHandler.makeEncodingTable() - for i in range(len(candidates)): - candidates[i].eliminated = False - fastq = FastqFile(path, qualityScoreScheme=qualityScore.qualityScoreHandler.encodingSchemes.sanger) - line = fastq.getNextRead() - lineCount = 0 - while line: - for candidate in candidates: - candidate.qualifyWithQualityString(line.quality) - remaining = len([scheme for scheme in candidates if not scheme.eliminated]) - lineCount += 1 - if lineLimit > 0: - if lineCount >= lineLimit: - break - if remaining == 0: - logger.error("No valid quality scoring scheme found for fastq file %s" %path) - fastq.close() - return None - elif remaining == 1: - break - for candidate in candidates: - if not candidate.eliminated: - del candidate.eliminated - fastq.close() - return candidate - - -def findSamplesInFolder(directory:str, namingStandard:typing.Type[fileNamingStandards.NamingStandard] = fileNamingStandards.ZymoServicesNamingStandard): - import os - if not os.path.isdir(directory): - raise NotADirectoryError("%s is not a directory or not found." 
% directory) - fastqFileInfoList = [] - expectedEndings = fileNamingStandards.expectedEndings - for item in os.listdir(directory): - isFastqFile = False - for expectedEnding in expectedEndings: - if item.endswith(expectedEnding): - isFastqFile = True - break - if not isFastqFile: - continue - filePath = os.path.join(directory, item) - fastqFileInfoList.append(namingStandard(filePath)) - return fastqFileInfoList - - -def getSamplePairTableFromFolder(directory:str, namingStandard:typing.Type[fileNamingStandards.NamingStandard] = fileNamingStandards.ZymoServicesNamingStandard): - def hasMate(fastq:fileNamingStandards.NamingStandard, potentialMates:list): - for potentialMate in potentialMates: - if fastq.sameSample(potentialMate): - return potentialMate - return False - allFastqs = findSamplesInFolder(directory, namingStandard) - pairedFastqs = {"unpaired":[]} - forwardFiles = [fastq for fastq in allFastqs if fastq.direction == 1] - reverseFiles = [fastq for fastq in allFastqs if fastq.direction == 2] - for fastq in forwardFiles: - foundMate = hasMate(fastq, reverseFiles) - if foundMate: - reverseFiles.remove(foundMate) - pairedFastqs[fastq.sampleID] = (fastq, foundMate) - else: - pairedFastqs["unpaired"].append(fastq) - for fastq in reverseFiles: - pairedFastqs["unpaired"].append(fastq) - if not pairedFastqs["unpaired"]: - del pairedFastqs["unpaired"] - return pairedFastqs - - - - -if __name__ == "__main__": - test = getSamplePairTableFromFolder("c:/Users/mweinstein/dada2big/input/sequence") \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/formatReaders/fastq/fileNamingStandards.py b/miqScoreShotgunPublicSupport/formatReaders/fastq/fileNamingStandards.py deleted file mode 100644 index 92965c5..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/fastq/fileNamingStandards.py +++ /dev/null @@ -1,75 +0,0 @@ -expectedEndings = [".fastq", ".fq", ".fastq.gz", ".fq.gz"] - -class NamingStandard(object): - - __slots__ = ["fileName", "fileDirectory", "filePath", "sampleNumber", "group", "direction", "sampleID"] - - def __init__(self, filePath:str): - self.filePath = filePath - self.fileDirectory, self.fileName = self.separateNameAndDirectory(filePath) - self.group, self.sampleNumber, self.direction = self.getSampleInfo(self.fileName) - self.sampleID = (self.group, self.sampleNumber) - - def separateNameAndDirectory(self, path:str): - import os - directory, name = os.path.split(path) - return directory, name - - def getSampleInfo(self, fileName:str): - raise RuntimeError("This function should always be getting overridden. 
diff --git a/miqScoreShotgunPublicSupport/formatReaders/fastq/fileNamingStandards.py b/miqScoreShotgunPublicSupport/formatReaders/fastq/fileNamingStandards.py deleted file mode 100644 index 92965c5..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/fastq/fileNamingStandards.py +++ /dev/null @@ -1,75 +0,0 @@ -expectedEndings = [".fastq", ".fq", ".fastq.gz", ".fq.gz"] - -class NamingStandard(object): - - __slots__ = ["fileName", "fileDirectory", "filePath", "sampleNumber", "group", "direction", "sampleID"] - - def __init__(self, filePath:str): - self.filePath = filePath - self.fileDirectory, self.fileName = self.separateNameAndDirectory(filePath) - self.group, self.sampleNumber, self.direction = self.getSampleInfo(self.fileName) - self.sampleID = (self.group, self.sampleNumber) - - def separateNameAndDirectory(self, path:str): - import os - directory, name = os.path.split(path) - return directory, name - - def getSampleInfo(self, fileName:str): - raise RuntimeError("This function should always be getting overridden. If you see this, someone called the base class by mistake.") - - def sameSample(self, other): - if not isinstance(other, NamingStandard): - raise TypeError("Can only check for same sample in another naming standard type") - if self.group == other.group and self.sampleNumber == other.sampleNumber: - return True - return False - - def __str__(self): - return self.filePath - - def __hash__(self): - return hash(self.filePath) - - def __eq__(self, other): - return self.group == other.group and self.sampleNumber == other.sampleNumber and self.direction == other.direction - - def __ne__(self, other): - return not self.__eq__(other) - - def __xor__(self, other): - return self.sameSample(other) - - -class ZymoServicesNamingStandard(NamingStandard): - - def getSampleInfo(self, fileName:str): - baseName = fileName.split(".")[0] - group, sample, direction = baseName.split("_") - direction = int(direction.replace("R","")) - return group, sample, direction - - -class IlluminaStandard(NamingStandard): - - def getSampleInfo(self, fileName:str): - baseName = fileName.split(".")[0] - baseSplit = baseName.split("_") - group = baseSplit[0] - sample = baseSplit[1] - direction = int(baseSplit[2].replace("R","")) - return group, sample, direction - - -class ManualNamingStandard(NamingStandard): - - def __init__(self, filePath: str, group:str, number:int, direction:int): - if direction not in [1, 2]: - raise ValueError("Read direction must be either 1 or 2. %s was given" %direction) - self.filePath = filePath - self.fileDirectory, self.fileName = self.separateNameAndDirectory(filePath) - self.group = group - self.sampleNumber = number - self.direction = direction - self.sampleID = (self.group, self.sampleNumber) \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/formatReaders/gzipIdentifier.py b/miqScoreShotgunPublicSupport/formatReaders/gzipIdentifier.py deleted file mode 100644 index 4674712..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/gzipIdentifier.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -import gzip -import binascii - -def isGzipped(path:str): - if not os.path.isfile(path): - raise FileNotFoundError("Unable to determine if file %s is gzipped because that file does not exist." %path) - file = open(path, 'rb') - firstTwoBytes = file.read(2) - file.close() - if not binascii.hexlify(firstTwoBytes) == b'1f8b': - return False - try: - file = gzip.open(path, 'rb') - file.read(10) - file.close() - except OSError: - return False - return True diff --git a/miqScoreShotgunPublicSupport/formatReaders/qualityScore/__init__.py b/miqScoreShotgunPublicSupport/formatReaders/qualityScore/__init__.py deleted file mode 100644 index 5f50203..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/qualityScore/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .
import qualityScoreHandler - -__all__ = ["qualityScoreHandler"] \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/formatReaders/qualityScore/qualityScoreHandler.py b/miqScoreShotgunPublicSupport/formatReaders/qualityScore/qualityScoreHandler.py deleted file mode 100644 index 0c7f4b6..0000000 --- a/miqScoreShotgunPublicSupport/formatReaders/qualityScore/qualityScoreHandler.py +++ /dev/null @@ -1,165 +0,0 @@ -import logging -import math -import typing -logger = logging.getLogger(__name__) - -class EncodingScheme(object): - - def __init__(self, name:str, base:int, startCharacter:str, endCharacter:str, pErrorToScore:typing.Callable, scoreToPError:typing.Callable): - self.name = name - self.base = base - self.characterSet = self.makeCharacterSet(startCharacter, endCharacter) - self.range = self.calculateRange(startCharacter, endCharacter) - self.fromPErrorFormula = pErrorToScore - self.toPErrorFormula = scoreToPError - - def makeCharacterSet(self, start:str, end:str): - rangeStart = ord(start) - rangeEnd = ord(end) + 1 - return [chr(asciiValue) for asciiValue in range(rangeStart, rangeEnd)] - - def calculateRange(self, start:str, end:str): - rangeStart = ord(start) - rangeEnd = ord(end) - return rangeEnd - rangeStart - - def toPError(self, score:[int, str]): - if type(score) == str: - if len(score) == 1: - score = convertCharacterToScore(score, self.base) - else: - logger.critical("Attempt to convert multiple characters to error probability. Function can only handle one conversion per call.") - raise ValueError("Attempt to get pError for entire string. Need one value at a time. String: %s" %score) - return self.toPErrorFormula(score) - - def scoreFromPError(self, pError:float, round:bool=True): - return self.fromPErrorFormula(pError, round) - - def encodedFromPError(self, pError:float): - return chr(self.scoreFromPError(pError, round=True) + self.base) - - def qualifyWithQualityString(self, qualityString:str): - try: - throwaway = self.eliminated - except AttributeError: - self.eliminated = False - if not self.eliminated: - qualityString = str(qualityString) - for character in qualityString: - if not character in self.characterSet: - self.eliminated = True - break - - def __str__(self): - return self.name - - def __eq__(self, other:[str]): - if not type(other) in [str, EncodingScheme]: - raise TypeError("Unable to compare encoding scheme types with anything but string or other EncodingScheme objects") - return self.name == str(other) - - -def makeEncodingTable(): - encodingTable = [ # In order of likelihood - EncodingScheme("Sanger/Illumina 1.8+", 33, "!", "I", pErrorToPhred, phredToPError), - EncodingScheme("Illumina 1.8+", 33, "!", "J", pErrorToPhred, phredToPError), - EncodingScheme("Illumina 1.5-7", 64, "B", "i", pErrorToPhred, phredToPError), - EncodingScheme("Illumina 1.3-4", 64, "@", "h", pErrorToPhred, phredToPError), - EncodingScheme("Solexa", 64, ";", "h", pErrorToSolexa, solexaToPError), - EncodingScheme("Pacbio", 33, "!", "~", pErrorToPhred, phredToPError) - ] - return encodingTable - -def convertCharacterToScore(character, base:int=33): - return ord(character) - base - - -def convertToNumericArray(qualityString, base: int = 33): - phredScores = [] - for character in qualityString: - phredScores.append(convertCharacterToScore(character, base)) - return tuple(phredScores) - - -def pErrorToPhred(pError:float, roundValue:bool=True): - score = -10 * (math.log(pError, 10)) - if roundValue: - score = round(score) - return score - - -def phredToPError(phred:[int, float]): - return 10 ** (-phred/10)
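The Phred conversions above are easy to sanity-check by hand: Q = -10·log10(p) and p = 10^(-Q/10), with the encoded character being chr(Q + base). A standalone sketch (re-implemented here rather than imported, so it runs without the deleted module):

```python
import math

def p_error_to_phred(p):  # Q = -10 * log10(p)
    return -10 * math.log(p, 10)

def phred_to_p_error(q):  # p = 10 ** (-Q / 10)
    return 10 ** (-q / 10)

assert round(p_error_to_phred(0.001)) == 30       # 1-in-1000 error -> Q30
assert abs(phred_to_p_error(30) - 0.001) < 1e-12  # and back again
print(chr(30 + 33))  # '?' -- Q30 in a base-33 (Sanger/Illumina 1.8+) string
```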
- - -def pErrorToSolexa(pError:float, roundValue:bool=True): #google the definition of "arcane" - score = -10 * (math.log(pError/(1-pError), 10)) - if roundValue: - score = round(score) - return score - - -def solexaToPError(solexa:[int, float]): #seriously, who uses this encoding anymore, and who realizes that it's a slightly different formula? - return 1 / ((10 ** (solexa/10)) + 1) #Let's hope I don't have to derive that one again - - -class _Encodings(object): - - def __init__(self): - self.sanger = EncodingScheme("Sanger/Illumina 1.8+", 33, "!", "I", pErrorToPhred, phredToPError) - self.illumina = EncodingScheme("Illumina 1.8+", 33, "!", "J", pErrorToPhred, phredToPError) - self.illumina1_8 = self.illumina - self.illumina1_5 = EncodingScheme("Illumina 1.5-7", 64, "B", "i", pErrorToPhred, phredToPError) - self.illumina1_3 = EncodingScheme("Illumina 1.3-4", 64, "@", "h", pErrorToPhred, phredToPError) - self.solexa = EncodingScheme("Solexa", 64, ";", "h", pErrorToSolexa, solexaToPError) - self.pacbio = EncodingScheme("Pacbio", 33, "!", "~", pErrorToPhred, phredToPError) - - -encodingSchemes = _Encodings() - - -def cumulativeExpectedErrorArray(qualityString:str, encoding:EncodingScheme=encodingSchemes.illumina): - cumulativeExpectedErrorArray = [] - cumulativeExpectedError = 0.0 #ask me no questions, I'll tell you no lies/errors - qualityString = str(qualityString) - for character in qualityString: - cumulativeExpectedError += encoding.toPError(character) - cumulativeExpectedErrorArray.append(cumulativeExpectedError) - return cumulativeExpectedErrorArray - - -def cumulativeExpectedErrorArrayDada2Exact(qualityString:str, encoding:EncodingScheme=encodingSchemes.illumina): - cumulativeExpectedErrorArray = [] - cumulativeExpectedError = 0.0 #ask me no questions, I'll tell you no lies/errors - qualityString = str(qualityString) - for character in qualityString: - score = ord(character) - encoding.base - cumulativeExpectedError += 10 ** (-score/10) - cumulativeExpectedErrorArray.append(cumulativeExpectedError) - return cumulativeExpectedErrorArray - - -def convertQualityString(qualityString:str, inputScheme:EncodingScheme, outputScheme:EncodingScheme): - qualityString = str(qualityString) - if inputScheme.fromPErrorFormula == outputScheme.fromPErrorFormula: - baseDifference = inputScheme.base - outputScheme.base - outputString = "" - for character in qualityString: - outputString += chr(ord(character) - baseDifference) - return outputString - else: - outputString = "" - for character in qualityString: - pError = inputScheme.toPError(character) - outputString += outputScheme.encodedFromPError(pError) - return outputString - - -if __name__ == "__main__": - test = makeEncodingTable() - convert = encodingSchemes.illumina.encodedFromPError(0.0001) - func1 = phredToPError == phredToPError - func2 = solexaToPError == phredToPError - testString = "CCCCCGG7FGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGFFGGGGGG9EGGF8F8FGGFGGGFGGFGGGGFG8FFGEGGGGGGGEFGGGGGGCFGGFFGEGEDFGGGGCFGFGG9,CACGGGGEGGEGGFGFGFGGEEGGGF8EGDDGGGGFGGGFFGGEGGGD*:>ECCB:AFG>)::+@>CFFG?FFD><>FE8DFF>>F31CEC*1<)9FF=**.68*:F self.upperBound: - logger.warning("Got an out of bounds parameter set: %s set at %s. LowerBound: %s UpperBound: %s" %(self.name, value, self.lowerBound, self.upperBound)) - if self.usingValidationList: - if value not in self.validationList: - logger.warning("Got a parameter being set that is not on the validation list: %s set at %s.
ValidationList: %s" %(self.name, value, self.validationList)) - if self.logLevel: - value = self.setLogLevel(value) - self.value = value - return usingEnvironment - - def setLogLevel(self, value): - valueTable = {"DEBUG" : logging.DEBUG, - "INFO" : logging.INFO, - "WARNING" : logging.WARNING, - "ERROR" : logging.ERROR, - "CRITICAL" : logging.CRITICAL} - return valueTable[value] - - def setBooleanValue(self, value:str): - if not value: - return False - elif self.value in ["FALSE", "false", "False", "0", 0]: - return False - else: - return True - - def setType(self, value): - if type(self.typeRequirement) == type: - try: - return self.typeRequirement(value) - except Exception as err: - logMessage = "Attempting to cast environment variable %s with value %s to type %s resulted in an exception as follows: \n%s" %(self.name, value, self.typeRequirement, err) - logger.exception(logMessage) - raise ArgumentTypeValidationError() - if type(self.typeRequirement) == type: - allowedTypes = [self.typeRequirement] - else: - allowedTypes = self.typeRequirement - allowedTypes = [self.typeRequirement] - for valueType in typeHeirarchy: - if valueType in allowedTypes: - try: - return valueType(value) - except: #Using a universal catch here, since I'm using this as a test and expect it to fail regularly - continue - logger.error("Unable to cast environment variable parameter %s to one of its required types: %s. Env variable value: %s" %(self.name, self.typeRequirement, value)) - raise ArgumentTypeValidationError("Unable to cast environment variable parameter %s to one of its required types: %s. Env variable value: %s" %(self.name, self.typeRequirement, value)) - - def formArgument(self): - if not self.isArgument: - return "" - if self.value is None: - return "" - if self.typeRequirement == bool: - return self.parseBooleanArg() - if self.positionalArg: - return str(self.value) - if type(self.value) == str and self.value.startswith("="): - return "%s%s" %(self.flag, self.value) - else: - return "%s %s" %(self.flag, self.value) - - def parseBooleanArg(self): - if not self.value: - return "" - elif self.value in ["FALSE", "false", "False", "0", 0]: - return "" - else: - return self.flag - - def overview(self): - returnDict = {"name":self.name, "type":type(self.value), "value":self.value, "flag":self.flag} - return str(returnDict) - - def __str__(self): - return str(self.value) - - def __eq__(self, other): - return self.value == other - - def __bool__(self): - if type(self.value) == bool: - return self.value - else: - return not self.value is None - - -class ParameterSideLoad(EnvVariable): - - def setEnvironmentValue(self): - return False - - -class EnvParameters(object): - """ - How to use: - Initialize an empty parameter set with myInstance = EnvParameters() - Add values that check environment variable using the following syntax: - myInstance.addParameter(name, type, [optional values]) - add values in directly using the side load method when additional logic at checking time is required: - myInstance.sideloadParameter(name, value, [optional values]) - """ - def __init__(self): - self.parameters = {} - self.variableNames = set() - self.flags = set() - - def addParameter(self, name:str, typeRequirement:[type, list, tuple], default=None, flag:[int, str]=None, validationList:list=None, lowerBound:[int,float]=None, upperBound:[int,float]=None, expectedFile:bool=False, createdFile:bool=False, expectedDirectory:bool=False, createdDirectory:bool=False, logLevel:bool=False, required:bool = False, 
externalValidation:bool=False): - parameter = EnvVariable(name, typeRequirement, default, flag, validationList, lowerBound, upperBound, expectedFile, createdFile, expectedDirectory, createdDirectory, logLevel, required, externalValidation) - if not parameter.environmentVariableName in self.variableNames: - self.variableNames.add(parameter.environmentVariableName) - else: - logger.critical("Environment variable name collision for %s" %parameter.environmentVariableName) - raise ArgumentValueException("Environment variable name collision for %s" %parameter.environmentVariableName) - if parameter.isArgument: - if not parameter.flag in self.flags: - self.flags.add(parameter.flag) - else: - logger.critical("Environment variable argument flag collision for %s" %parameter.flag) - raise ArgumentValueException("Environment variable flag collision for %s" %parameter.flag) - self.parameters[parameter.name] = parameter - - def sideLoadParameter(self, name:str, value, flag:[int, str]=None, expectedFile:bool=False, createdFile:bool=False, expectedDirectory:bool=False, createdDirectory:bool=False): - parameter = ParameterSideLoad(name, type(value), default=value, validationList=[value], expectedFile=expectedFile, createdFile=createdFile, expectedDirectory=expectedDirectory, createdDirectory=createdDirectory) - if not parameter.environmentVariableName in self.variableNames: - self.variableNames.add(parameter.environmentVariableName) - else: - logger.critical("Environment variable name collision for %s on side load" %parameter.environmentVariableName) - raise ArgumentValueException("Environment variable name collision for %s" %parameter.environmentVariableName) - if parameter.isArgument: - if not parameter.flag in self.flags: - self.flags.add(parameter.flag) - else: - logger.critical("Environment variable argument flag collision for %s" %parameter.flag) - raise ArgumentValueException("Environment variable flag collision for %s" %parameter.flag) - self.parameters[parameter.name] = parameter - - def buildFlaggedArgumentString(self): - flaggedArgs = [] - for key in self.parameters: - parameter = self.parameters[key] - if parameter.isArgument and not parameter.positionalArg: - flaggedArgs.append(parameter.formArgument()) - return " ".join(flaggedArgs) - - def buildPositionalArgumentStrings(self): - import operator - prependArgs = [] - appendArgs = [] - for key in self.parameters: - parameter = self.parameters[key] - if parameter.isArgument and parameter.positionalArg: - if parameter.flag >= 0: - prependArgs.append(parameter) - else: - appendArgs.append(parameter) - if not (prependArgs or appendArgs): - return ("", "") - if prependArgs: - prependArgs.sort(key=operator.attrgetter("flag")) - prependArgs = [arg.formArgument() for arg in prependArgs] - if appendArgs: - appendArgs.sort(key=operator.attrgetter("flag")) - appendArgs = [arg.formArgument() for arg in appendArgs] - prependArgString = " ".join(prependArgs) - appendArgString = " ".join(appendArgs) - return (prependArgString, appendArgString) - - def buildArgString(self): - beginning, end = self.buildPositionalArgumentStrings() - middle = self.buildFlaggedArgumentString() - argList = [item for item in [beginning, middle, end] if item] - return " ".join(argList) - - def checkCreatedFileStructures(self): - for key in self.parameters: - parameter = self.parameters[key] - if parameter.createdDirectory: - parameter.createDirectory() - for key in self.parameters: - parameter = self.parameters[key] - if parameter.createdFile: - parameter.validateCreatedFile() - - 
def __getattr__(self, item): - if item in self.parameters: - return self.parameters[item] - else: - if not type(item) == str: - raise AttributeError("No parameter %s was found in the parameter set. Known parameters: %s" %(item, list(self.parameters.keys()))) - for key in self.parameters: - if not type(key) == str: - continue - keylower = key.lower() - itemlower = item.lower() - if keylower == itemlower: - return self.parameters[key] - raise AttributeError("No parameter %s was found in the parameter set. Known parameters: %s" %(item, list(self.parameters.keys()))) - - -class ArgumentTypeValidationError(Exception): - pass - -class ArgumentValueException(Exception): - pass - -class EnvironmentVariableParameterException(Exception): - pass - -def assertionFails(bool_:bool): - try: - assert bool_, "Critical assertion failed." - except AssertionError: - return True - else: - return False - -if __name__ == "__main__": - test = EnvParameters() - test.addParameter("first", str, default="The first", validationList=["The first"]) - test.sideLoadParameter("sideload", "The side loaded one") - test.addParameter("second", str, "The second one", validationList=["The second one"]) \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/parameters/test_parameterParser.py b/miqScoreShotgunPublicSupport/parameters/test_parameterParser.py deleted file mode 100644 index 2a2162d..0000000 --- a/miqScoreShotgunPublicSupport/parameters/test_parameterParser.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -from pytest import mark - -def setEnvironmentForTest(): - os.environ["INTEGER"] = "42" - os.environ["FLOAT"] = "4.2" - os.environ["STRING"] = "text" - os.environ["NEGBOOL"] = "False" - os.environ["POSBOOL"] = "True" - -@mark.build -@mark.parameters -def test_parameterParse(): - setEnvironmentForTest() - from .. import parameters - parameterSet = parameters.EnvParameters() - parameterSet.addParameter("integer", int, default=0) - parameterSet.addParameter("float", float, default=0.0) - parameterSet.addParameter("string", str, default="override") - parameterSet.addParameter("negbool", bool, default=True) - parameterSet.addParameter("posbool", bool, default=False) - assert parameterSet.integer == 42 - assert parameterSet.float == 4.2 - assert parameterSet.string == "text" - assert parameterSet.negbool == False - assert parameterSet.posbool == True
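The test above doubles as documentation for the intended workflow. A minimal sketch, assuming the package imports under the submodule name and that `required`/`expectedDirectory` behave as the (partially lost) EnvVariable constructor suggests; the SAMPLEDIR variable and all values are hypothetical:

```python
import os
from miqScoreShotgunPublicSupport import parameters

os.environ["SAMPLEDIR"] = "/data/sample1"  # normally set by the container runtime
params = parameters.EnvParameters()
params.addParameter("sampledir", str, expectedDirectory=True, required=True)
params.addParameter("threads", int, default=4, flag="-t")
print(params.sampledir)         # case-insensitive attribute lookup via __getattr__
print(params.buildArgString())  # "-t 4" -- flagged args assembled for a CLI call
```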
This is a bug.") \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/__init__.py b/miqScoreShotgunPublicSupport/projectData/microbiome/__init__.py deleted file mode 100644 index 809152b..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -__all__ = ["sixteenS", - "dada2Outputs"] - -from . import sixteenS -from . import dada2Outputs \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/dada2Outputs.py b/miqScoreShotgunPublicSupport/projectData/microbiome/dada2Outputs.py deleted file mode 100644 index 12cb123..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/dada2Outputs.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import logging -logger = logging.getLogger(__name__) - -class Dada2AmpliconCount(object): - - def __init__(self, path:str): - if not os.path.isfile(path): - logger.critical("Tried to load dada2 amplicon count table, but could not find file at %s" %path) - raise FileNotFoundError("Unable to find file %s" %path) - self.path = path - self.ampliconTable = self.readTable(path) - self.readCount = self.getReadCount() - self.ampliconList = tuple(self.ampliconTable.keys()) - - def readTable(self, path): - import csv - rawTable = [] - file = open(path, 'r') - csvHandle = csv.reader(file) - for line in csvHandle: - rawTable.append(line[1:]) - file.close() - zipped = zip(rawTable[0], rawTable[1]) - ampliconTable = {} - for line in zipped: - amplicon, count = line - ampliconTable[amplicon] = int(count) - return ampliconTable - - def getReadCount(self): - readCount = 0 - for amplicon in self.ampliconTable: - readCount += self.ampliconTable[amplicon] - return readCount - - def __iter__(self): - for amplicon in self.ampliconList: - yield amplicon - - def __getitem__(self, item): - return self.ampliconTable[item] - - def __str__(self): - return "Dada2 amplicon table. %s amplicons. %s reads. From %s" %(len(self.ampliconList), self.readCount, self.path) - - -class Dada2GenusSpeciesCalls(object): - - def __init__(self, path:str): - if not os.path.isfile(path): - logger.critical("Tried to load dada2 amplicon count table, but could not find file at %s" %path) - raise FileNotFoundError("Unable to find file %s" %path) - self.path = path - self.callTable = self.makeSequenceCallTable() - self.ampliconList = list(self.callTable.keys()) - - def makeSequenceCallTable(self): - file = open(self.path, "r") - callTable = {} - import csv - csvHandle = csv.reader(file) - for line in csvHandle: - sequence, genus, species = line - if not sequence: - continue - callTable[sequence] = (genus, species) - return callTable - - def __iter__(self): - for amplicon in self.ampliconList: - yield amplicon - - def __getitem__(self, item): - return self.callTable[item] - - def __str__(self): - return "Dada2 genus/species call table. %s amplicons. 
From %s" %(len(self.ampliconList), self.path) - - -class Dada2KingdomGenusCalls(object): - - def __init__(self, path:str): - if not os.path.isfile(path): - logger.critical("Tried to load dada2 amplicon count table, but could not find file at %s" %path) - raise FileNotFoundError("Unable to find file %s" %path) - self.path = path - self.callTable = self.makeSequenceCallTable() - self.ampliconList = list(self.callTable.keys()) - - def makeSequenceCallTable(self): - file = open(self.path, "r") - callTable = {} - import csv - csvHandle = csv.reader(file) - for line in csvHandle: - sequence, kingdom, phylum, className, order, family, genus = line - if not sequence: - continue - callTable[sequence] = (kingdom, phylum, className, order, family, genus) - return callTable - - def __iter__(self): - for amplicon in self.ampliconList: - yield amplicon - - def __getitem__(self, item): - return self.callTable[item] - - def __str__(self): - return "Dada2 genus/species call table. %s amplicons. From %s" %(len(self.ampliconList), self.path) - - -def getHitCountByTaxa(ampliconCountTable:[str, Dada2AmpliconCount], taxaCallTable:[str, Dada2GenusSpeciesCalls, Dada2KingdomGenusCalls], genusSpeciesTable:bool = False): - hitCountTable = {} - if type(ampliconCountTable) == str: - ampliconCountTable = Dada2AmpliconCount(ampliconCountTable) - if type(taxaCallTable) == str: - if not genusSpeciesTable: - taxaCallTable = Dada2KingdomGenusCalls(taxaCallTable) - else: - taxaCallTable = Dada2GenusSpeciesCalls(taxaCallTable) - for sequence in ampliconCountTable: - if not sequence in taxaCallTable: - continue - taxa = taxaCallTable[sequence] - if not taxa in hitCountTable: - hitCountTable[taxa] = 0 - hitCountTable[taxa] += ampliconCountTable[sequence] - return hitCountTable - - -if __name__ == "__main__": - chimeras = Dada2AmpliconCount("in1055_1.SV.csv") - chimeraFree = Dada2AmpliconCount("in1055_1.SV.nochimera.csv") - taxa = Dada2KingdomGenusCalls("in1055_1.SV.taxa.csv") - hitCounts = getHitCountByTaxa(chimeraFree.ampliconTable, taxa.callTable) - print("something") \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/__init__.py b/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/__init__.py deleted file mode 100644 index 174f5ea..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ["metadata"] - -from . import metadata \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/__init__.py b/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/__init__.py deleted file mode 100644 index 51c7c0c..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from . import masterTable -from . 
diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/__init__.py b/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/__init__.py deleted file mode 100644 index 174f5ea..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ["metadata"] - -from . import metadata \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/__init__.py b/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/__init__.py deleted file mode 100644 index 51c7c0c..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from . import masterTable -from . import pipelineParameters - -__all__ = ["masterTable", - "pipelineParameters"] - - - - -def crossValidationPassed(sampleData: masterTable.MasterTable, parameters: pipelineParameters.PipelineParameters): - for line in sampleData: - if not line.seqType in parameters: - return False - return True \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/masterTable.py b/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/masterTable.py deleted file mode 100644 index 3920ef7..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/masterTable.py +++ /dev/null @@ -1,186 +0,0 @@ -from .... import generics -from ....utilities import validations - -class GroupInfo(object): - - def __init__(self, seqType:str): - self.seqType = seqType - self.members = [] - - def append(self, member:str): - self.members.append(member) - - def __iter__(self): - for member in self.members: - yield member - - def __str__(self): - return "%s: %s" %(self.seqType, self.members) - - -class MasterTable(generics.InputFile): - - def parseInputFile(self): - import csv - inputList = [] - masterTableFile = open(self.filePath, 'r') - masterTable = csv.reader(masterTableFile) - for line in masterTable: - if line[0].startswith("#"): - self.categoryTitles = line[6:] - continue - inputList.append(MasterTableLine(line)) - masterTableFile.close() - if not self.validTableData(inputList): - raise ValueError("Got invalid sample data in %s" %inputList) - self.samples = inputList.copy() - self.errorModelGroups = self.getErrorModelGroups() - self.groupInfo = self.makeGroupInfoTable() - self.lookupTable = self.makeLookupTable() - self.categoryCount, self.categorySpace = self.getGroupInfo() - - def makeLookupTable(self): - lookupTable = {} - for index, sample in enumerate(self.samples): - lookupName = "%s-%s" %(sample.sampleID, sample.sampleLabel) - lookupTable[lookupName] = index - return lookupTable - - def makeGroupInfoTable(self): - groupInfo = {} - for sample in self.samples: - if not sample.groupID in groupInfo: - groupInfo[sample.groupID] = GroupInfo(sample.seqType) - groupInfo[sample.groupID].append(sample.sampleLabel) - return groupInfo - - def getErrorModelGroups(self): - errorModelGroups = set() - for sample in self.samples: - errorModelGroups.add(sample.errorModelGroup) - return errorModelGroups - - def validTableData(self, table): - uniqueIdentifiers = [line.identifier for line in table] - testSet = set() - for identifier in uniqueIdentifiers: - if not identifier in testSet: - testSet.add(identifier) - else: - raise ValueError("Found a duplicate Group ID and Unique Label for %s" %identifier) - return True - - def getGroupInfo(self): - categorySpace = [] - categoryCount = len(self.samples[0].categories) - for i in range(categoryCount): - categorySpace.append(set()) - for sample in self.samples: - if not len(sample.categories) == categoryCount: - raise ValueError("Found a line with a different number of categories than others.
Please check formatting of the input csv.\nCSV: %s\nLine: %s" %(self.filePath, sample)) - for index, category in enumerate(sample.categories): - categorySpace[index].add(category) - return categoryCount, categorySpace - - def __getitem__(self, item): - if type(item) == int: - return self.samples[item] - else: - return self.samples[self.lookupTable[item]] - - def __iter__(self): - for line in self.samples: - yield line - - def __str__(self): - output = "" - for line in self.samples: - output += str(line) + "\n" - return output - - -class MasterTableLine(object): - - def __init__(self, lineArray:list): - if not len(lineArray) >= 6: - raise ValueError("Each line in the master table should have at least 6 elements. A line (below) had %s elements.\n%s" %(len(lineArray), lineArray)) - number, projectID, runID, groupID, seqType, uniqueLabel = lineArray[:6] - categories = lineArray[6:] - self.setNumber(number) - self.setProjectID(projectID) - self.setRunID(runID) - self.setGroupID(groupID) - self.setSeqType(seqType) - self.setUniqueLabel(uniqueLabel) - self.setCategories(categories) - self.identifier = (self.groupID.lower(), self.uniqueLabel.lower()) - self.sampleID = "%s_%s" %(projectID, number) - self.groupID = "%s_%s" %(groupID, seqType) - self.read1 = "%s_R1.fastq.gz" %self.sampleID - self.read2 = "%s_R2.fastq.gz" %self.sampleID - self.errorModelGroup = self.getErrorModelGroup() - self.createdFiles = {} - self.baseName, self.read1Base, self.read2Base = self.getBaseNames() - if not self.uniqueLabel: - self.sampleLabel = self.sampleID - else: - self.sampleLabel = self.uniqueLabel - - def getBaseNames(self): - read1Base = self.read1.split(".")[0] - read2Base = self.read2.split(".")[0] - base = "_".join(read1Base.split("_")[:2]) - return base, read1Base, read2Base - - def setNumber(self, sampleName): - self.sampleName = validations.naming.alphaNumericString(sampleName.strip(), replacement="") - - def setProjectID(self, projectID:str): - self.projectID = validations.naming.alphaNumericString(projectID.strip(), replacement="") - - def setRunID(self, runID:str): - import datetime - runID = runID.strip() - self.runID = validations.naming.alphaNumericString(runID, replacement="") - rawDate = runID[-6:] - if not rawDate.isdigit(): - self.runDate = None - else: - day = int(rawDate[0:2]) - month = int(rawDate[2:4]) - year = int(rawDate[4:6]) - self.runDate = datetime.date(year, month, day) - - def setGroupID(self, groupID:str): - self.groupID = validations.naming.alphaNumericString(groupID, replacement="") - - def setSeqType(self, seqType:str): - self.seqType = seqType.strip() - - def setUniqueLabel(self, uniqueLabel:str): - self.uniqueLabel = validations.naming.alphaNumericString(uniqueLabel.strip(), replacement="") - - def setCategories(self, categories:list): - self.categories = [] - for category in categories: - if category: - self.categories.append(validations.naming.alphaNumericString(category.strip())) - else: - self.categories.append(None) - - def getErrorModelGroup(self): - return "%s_%s" %(self.runID, self.seqType) - - def addCreatedFile(self, path:str, identifier:str, direction:int = 1): - try: - hash(identifier) - except TypeError: - raise ValueError("Identifier value must be hashable. %s of type %s was given, which is not."
%(identifier, type(identifier))) - if not direction in [1, 2]: - print("WARNING: Direction was given as something other than 1 or 2.") - if not identifier in self.createdFiles: - self.createdFiles[identifier] = {} - self.createdFiles[identifier][direction] = path - - def __str__(self): - identifier = "%s:%s" %(self.sampleID, self.seqType) - return identifier \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/pipelineParameters.py b/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/pipelineParameters.py deleted file mode 100644 index 633f93d..0000000 --- a/miqScoreShotgunPublicSupport/projectData/microbiome/sixteenS/metadata/pipelineParameters.py +++ /dev/null @@ -1,179 +0,0 @@ -from .... import generics -from ....utilities import validations - -parameterKeys = ("fwdPrimerLength", - "revPrimerLength", - "fwdReadLengthWithPrimer", - "revReadLengthWithPrimer", - "minSeqSize", - "refSeq", - "refTaxa", - "refTree", - "refAlign", - "rdpDatabase") - -class PipelineParameters(generics.InputFile): - - def parseInputFile(self): - if self.filePath.lower().endswith(".json"): - self.loadJSON() - elif self.filePath.lower().endswith(".csv"): - self.loadCSV() - else: - file = open(self.filePath, 'r') - firstCharacter = file.read(1) - file.close() - if firstCharacter == "{": - self.loadJSON() - else: - self.loadCSV() - self.lookupTable = {} - self.makeLookupTable() - - def loadJSON(self): - import json - self.parameterTable = {} - file = open(self.filePath, 'r') - self.rawParameters = json.load(file) - file.close() - for pipeline in self.rawParameters: - for key in parameterKeys: - if not key in self.rawParameters[pipeline]: - raise KeyError("Unable to find the %s parameter for the %s pipeline." %(key, pipeline)) - for pipelineName in self.rawParameters: - pipeline = self.rawParameters[pipelineName] - self.parameterTable[pipelineName] = PipelineParameterSet(pipelineName, pipeline["fwdPrimerLength"], pipeline["revPrimerLength"], pipeline["fwdReadLengthWithPrimer"], pipeline["revReadLengthWithPrimer"], pipeline["minSeqSize"], pipeline["refSeq"], pipeline["refTaxa"], pipeline["refTree"], pipeline["refAlign"], pipeline["rdpDatabase"])
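For reference, a sketch of a parameter file that loadJSON above would accept; the pipeline name and all values are illustrative, and the key set mirrors parameterKeys:

```python
import json

# Hypothetical pipeline entry; every key in parameterKeys must be present.
config = {"16S-V3V4": {
    "fwdPrimerLength": 17, "revPrimerLength": 21,
    "fwdReadLengthWithPrimer": 301, "revReadLengthWithPrimer": 301,
    "minSeqSize": 400, "refSeq": "refs/v3v4.fasta", "refTaxa": "refs/v3v4.taxa",
    "refTree": "refs/v3v4.tree", "refAlign": "refs/v3v4.align",
    "rdpDatabase": "refs/rdp.db"}}
with open("pipelines.json", "w") as file:
    json.dump(config, file, indent=4)
```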
- - def loadCSV(self): - import csv - self.parameterTable = {} - file = open(self.filePath, 'r') - csvHandle = csv.reader(file) - for line in csvHandle: - if line[0].startswith("#"): - continue - else: - if not len(line) == 11: - raise ValueError("Pipeline parameter dataline length not equal to 11. %s items found.\n%s" %(len(line), line)) - seqType, fwdPrimerLength, revPrimerLength, fwdReadLengthWithPrimer, revReadLengthWithPrimer, minSeqSize, refSeq, refTaxa, refTree, refAlign, rdpDatabase = line - fwdPrimerLength = int(fwdPrimerLength) - revPrimerLength = int(revPrimerLength) - revReadLengthWithPrimer = int(revReadLengthWithPrimer) - fwdReadLengthWithPrimer = int(fwdReadLengthWithPrimer) - minSeqSize = int(minSeqSize) - if seqType in self.parameterTable: - raise KeyError("Duplicate sequence type found: %s" %seqType) - parameterSet = PipelineParameterSet(seqType, fwdPrimerLength, revPrimerLength, fwdReadLengthWithPrimer, revReadLengthWithPrimer, minSeqSize, refSeq, refTaxa, refTree, refAlign, rdpDatabase) - self.parameterTable[seqType] = parameterSet - file.close() - - def saveJSON(self, outputFilePath:str): - import json - outputData = {} - for pipelineName in self.parameterTable: - outputData[pipelineName] = self.parameterTable[pipelineName].makeDict() - outputFile = open(outputFilePath, 'w') - json.dump(outputData, outputFile, indent = 4) - outputFile.close() - - def makeLookupTable(self): - for key in self.parameterTable: - if key in self.lookupTable: - raise KeyError("Error: Duplicate pipeline keys found: %s" %key) - if key.lower() in self.lookupTable: - raise KeyError("Error: Casing collision found for key %s" %key) - self.lookupTable[key] = key - self.lookupTable[key.lower()] = key - - def __getitem__(self, item): - if not item.lower() in self.lookupTable: - raise KeyError("Unable to find a pipeline parameter set called %s" %item) - return self.parameterTable[self.lookupTable[item.lower()]] - - def __str__(self): - return str(self.parameterTable) - - def __contains__(self, item): - return item in self.lookupTable - - -class PipelineParameterSet(object): - - def __init__(self, name:str, fwdPrimerLength:int, revPrimerLength:int, fwdReadLengthWithPrimer:int, revReadLengthWithPrimer:int, minSeqSize:int, refSeq:str, refTaxa:str, refTree:str, refAlign:str, rdpDatabase:str): - self.name = name - self.fwdPrimerLength = fwdPrimerLength - self.revPrimerLength = revPrimerLength - self.fwdReadLengthWithPrimer = fwdReadLengthWithPrimer - self.revReadLengthWithPrimer = revReadLengthWithPrimer - self.minSeqSize = minSeqSize - self.refSeq = refSeq - self.refTaxa = refTaxa - self.refTree = refTree - self.refAlign = refAlign - self.rdpDatabase = rdpDatabase - self.performValidations() - - def makeDict(self): - dictionary = {"fwdPrimerLength": self.fwdPrimerLength, - "revPrimerLength": self.revPrimerLength, - "fwdReadLengthWithPrimer": self.fwdReadLengthWithPrimer, - "revReadLengthWithPrimer": self.revReadLengthWithPrimer, - "minSeqSize": self.minSeqSize, - "refSeq": self.refSeq, - "refTaxa": self.refTaxa, - "refTree": self.refTree, - "refAlign": self.refAlign, - "rdpDatabase": self.rdpDatabase} - return dictionary - - def performValidations(self): - assert validations.numerical.isPositiveInteger(self.fwdPrimerLength), "Error. Forward primer length must be a positive integer. %s was given." %self.fwdPrimerLength - self.fwdPrimerLength = int(self.fwdPrimerLength) - assert validations.numerical.isPositiveInteger(self.revPrimerLength), "Error. Reverse primer length must be a positive integer. %s was given." %self.revPrimerLength - self.revPrimerLength = int(self.revPrimerLength) - assert self.fwdReadLengthWithPrimer > self.fwdPrimerLength, "Error. Forward read length with primer should always be longer than the primer itself.
Primer length: %s, forward read length with primer: %s" %(self.fwdPrimerLength, self.fwdReadLengthWithPrimer) - self.fwdReadLengthWithPrimer = int(self.fwdReadLengthWithPrimer) - assert self.revReadLengthWithPrimer > self.revPrimerLength, "Error. Reverse read length with primer should always be longer than the primer itself. Primer length: %s, reverse read length with primer: %s" %(self.revPrimerLength, self.revReadLengthWithPrimer) - self.revReadLengthWithPrimer = int(self.revReadLengthWithPrimer) - assert validations.numerical.isPositiveInteger(self.minSeqSize), "Error. Minimum sequence size must be a positive integer. %s was given." %self.minSeqSize - self.minSeqSize = int(self.minSeqSize) - # KILLING THESE VALIDATIONS, THIS NEEDS TO HAPPEN OUTSIDE OF HERE, AS THE FOLDER WILL BE UNKNOWN BY THIS FUNCTION - # if self.refSeq: - # refSeq = validations.system.fileExistsAndAbsolutePath(self.refSeq) - # if not refSeq: - # pass #raise FileNotFoundError("Unable to find reference sequence file at %s" %self.refSeq) - # self.refSeq = refSeq - # else: - # self.refSeq = None - # if self.refTaxa: - # refTaxa = validations.system.fileExistsAndAbsolutePath(self.refTaxa) - # if not refTaxa: - # pass #raise FileNotFoundError("Unable to find reference taxa file at %s" %self.refTaxa) - # self.refTaxa = refTaxa - # else: - # self.refTaxa = None - # if self.refTree: - # refTree = validations.system.fileExistsAndAbsolutePath(self.refTree) - # if not refTree: - # pass #raise FileNotFoundError("Unable to find reference tree file at %s" %self.refTree) - # self.refTree = refTree - # else: - # self.refTree = None - # if self.refAlign: - # refAlign = validations.system.fileExistsAndAbsolutePath(self.refAlign) - # if not refAlign: - # pass #raise FileNotFoundError("Unable to find reference alignment file at %s" %self.refAlign) - # self.refAlign = refAlign - # else: - # self.refAlign = None - # if self.rdpDatabase: - # rdpDatabase = validations.system.fileExistsAndAbsolutePath(self.rdpDatabase) - # if not rdpDatabase: - # pass #raise FileNotFoundError("Unable to find RDP database file at %s" %self.rdpDatabase) - # self.rdpDatabase = rdpDatabase - # else: - # self.rdpDatabase = None - - def __str__(self): - return str(self.makeDict()) - diff --git a/miqScoreShotgunPublicSupport/projectData/utilities/__init__.py b/miqScoreShotgunPublicSupport/projectData/utilities/__init__.py deleted file mode 100644 index af32b47..0000000 --- a/miqScoreShotgunPublicSupport/projectData/utilities/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ["validations"] - -from . import validations \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/projectData/utilities/validations/__init__.py b/miqScoreShotgunPublicSupport/projectData/utilities/validations/__init__.py deleted file mode 100644 index e2c6d50..0000000 --- a/miqScoreShotgunPublicSupport/projectData/utilities/validations/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -__all__ = ["fileIntegrity", - "naming", - "numerical", - "system"] - -from . import fileIntegrity -from . import naming -from . import numerical -from . 
import system diff --git a/miqScoreShotgunPublicSupport/projectData/utilities/validations/fileIntegrity.py b/miqScoreShotgunPublicSupport/projectData/utilities/validations/fileIntegrity.py deleted file mode 100644 index 1133b98..0000000 --- a/miqScoreShotgunPublicSupport/projectData/utilities/validations/fileIntegrity.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import logging -logger = logging.getLogger(__name__) - -def md5File(path:str, dontWarnOnEmptyFile = False): - import hashlib - path = os.path.abspath(path) - if not os.path.isfile(path): - logger.critical("Unable to find file for hashing at %s" %path) - raise FileNotFoundError("File not found: %s" %path) - md5 = hashlib.md5() - with open(path, 'rb') as file: - for chunk in iter(lambda: file.read(4096), b""): - md5.update(chunk) - digest = md5.hexdigest() - if digest == "d41d8cd98f00b204e9800998ecf8427e" and not dontWarnOnEmptyFile: - logger.warning("MD5 value d41d8cd98f00b204e9800998ecf8427e is the hash of an empty file. Someone should probably review this. File path: %s" %path) - logger.info("MD5 checksum for %s was %s" %(path, digest)) - return digest - -def getFileSize(path:str): - if not os.path.isfile(path): - return None - size = os.path.getsize(path) - return size - -def logFileInfo(path:str): - md5 = md5File(path) - size = getFileSize(path) - logger.info("File integrity info for %s: MD5=%s SIZE=%s" %(path, md5, size)) - return (md5, size) \ No newline at end of file
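A small usage sketch for the integrity helpers above (the path is illustrative). Note the deleted code's useful guard: d41d8cd98f00b204e9800998ecf8427e is the MD5 of an empty file, so seeing it usually means an upstream step silently produced nothing:

```python
from miqScoreShotgunPublicSupport.projectData.utilities.validations import fileIntegrity

# Hashes the file in 4 KB chunks, logs the checksum and size, and warns if the
# digest matches the empty-file MD5.
md5, size = fileIntegrity.logFileInfo("results/sample1_R1.fastq.gz")
print(md5, size)
```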
diff --git a/miqScoreShotgunPublicSupport/projectData/utilities/validations/naming.py b/miqScoreShotgunPublicSupport/projectData/utilities/validations/naming.py deleted file mode 100644 index ba18f92..0000000 --- a/miqScoreShotgunPublicSupport/projectData/utilities/validations/naming.py +++ /dev/null @@ -1,30 +0,0 @@ -defaultAllowableUniqueIDCharacters = "." - -def isAlphaNumeric(character:str, allowedCharacters:str = defaultAllowableUniqueIDCharacters, replacement = None): - if not type(character) == str: - return False - if not character: - return False - if len(character) > 1: - return alphaNumericString(character, allowedCharacters, replacement) - if allowedCharacters: - if character in allowedCharacters: - return True - if character.isalnum(): - return True - return False - - -def alphaNumericString(string:str, allowedCharacters:str = defaultAllowableUniqueIDCharacters, replacement = None): - if not string: - return False - validatedString = "" - for character in string: - if not isAlphaNumeric(character, allowedCharacters): - if replacement != None: - validatedString += replacement - else: - return False - else: - validatedString += character - return validatedString diff --git a/miqScoreShotgunPublicSupport/projectData/utilities/validations/numerical.py b/miqScoreShotgunPublicSupport/projectData/utilities/validations/numerical.py deleted file mode 100644 index 806b902..0000000 --- a/miqScoreShotgunPublicSupport/projectData/utilities/validations/numerical.py +++ /dev/null @@ -1,31 +0,0 @@ -#Numerical Validations - -def isInteger(value): - return type(value) == int - - -def isNumber(value): - return type(value) in [int, float] - - -def isPositive(value): - if isNumber(value): - return value > 0 - else: - return False - - -def isNotNegative(value): - if isNumber(value): - return value >= 0 - else: - return False - - -def isPositiveInteger(value): - return type(value) == int and isPositive(value) - - -def isNonNegativeInteger(value): - return type(value) == int and isNotNegative(value) - diff --git a/miqScoreShotgunPublicSupport/projectData/utilities/validations/system.py b/miqScoreShotgunPublicSupport/projectData/utilities/validations/system.py deleted file mode 100644 index e7cba24..0000000 --- a/miqScoreShotgunPublicSupport/projectData/utilities/validations/system.py +++ /dev/null @@ -1,13 +0,0 @@ -def fileExistsAndAbsolutePath(fileName): - import os - if not os.path.isfile(fileName): - return False - else: - return os.path.abspath(fileName) - -def directoryExistsAndAbsolutePath(directory): - import os - if not os.path.isdir(directory): - return False - else: - return os.path.abspath(directory) \ No newline at end of file diff --git a/miqScoreShotgunPublicSupport/reporting/__init__.py b/miqScoreShotgunPublicSupport/reporting/__init__.py deleted file mode 100644 index 61cba92..0000000 --- a/miqScoreShotgunPublicSupport/reporting/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -def sumDictionary(dictionary:dict): - total = 0 - for key in dictionary: - if key: - total += dictionary[key] - return total - - -def generateReadFateChartBody(readFateTable:dict, readFatePrintNames:dict=None): - printReadFateTable = {} - if readFatePrintNames: - for readFate in readFateTable: - if readFate in readFatePrintNames: - printReadFateTable[readFatePrintNames[readFate]] = readFateTable[readFate] - else: - printReadFateTable[readFate] = readFateTable[readFate] - else: - printReadFateTable = readFateTable.copy() - outputTable = "" - for fate in printReadFateTable: - outputTable += '\ - <tr>\n\ - <td>%s</td>\n\ - <td>%s</td>\n\ - </tr>\ - ' %(fate, round(printReadFateTable[fate], 2)) - return outputTable - - -def generateAbsoluteReadFateCounts(miqScore): - referenceCounts = sumDictionary(miqScore.referenceReadCounts) - absoluteReadFates = miqScore.nonreferenceReadCounts.copy() - absoluteReadFates["Reference"] = referenceCounts - return absoluteReadFates
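A sketch of what generateReadFateChartBody produces, assuming the HTML row markup reconstructed above is right (the original tags were lost in extraction); counts and print names are hypothetical:

```python
from miqScoreShotgunPublicSupport import reporting

readFates = {"Reference": 91250, "Unmapped_reads": 3200, "Chimera_like": 410}
printNames = {"Reference": "Aligned to reference"}
rows = reporting.generateReadFateChartBody(readFates, readFatePrintNames=printNames)
# rows is a string of <tr><td>fate</td><td>count</td></tr> blocks, ready to be
# dropped into an HTML report template's table body.
print(rows)
```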
def generateReplacementTable(sampleMiq, goodExampleMiq, badExampleMiq, readFatePrintNames:dict=None): - readFateTable = generateReadFateChartBody(generateAbsoluteReadFateCounts(sampleMiq), readFatePrintNames) - replacementTable = {"SAMPLENAME": sampleMiq.sampleID, - "MIQSCORE": str(round(sampleMiq.miqScore)), - "READFATETABLE": readFateTable, - "READFATECHART": sampleMiq.plots["readFates"], - "COMPOSITIONBARPLOT": sampleMiq.plots["compositionPlot"], - "GOODRADARPLOTLYSIS": goodExampleMiq.plots["radarPlots"]["Lysis Difficulty"], - "SAMPLERADARPLOTLYSIS": sampleMiq.plots["radarPlots"]["Lysis Difficulty"], - "BADRADARPLOTLYSIS": badExampleMiq.plots["radarPlots"]["Lysis Difficulty"], - "GOODRADARPLOTGC": goodExampleMiq.plots["radarPlots"]["GC Content"], - "SAMPLERADARPLOTGC": sampleMiq.plots["radarPlots"]["GC Content"], - "BADRADARPLOTGC": badExampleMiq.plots["radarPlots"]["GC Content"]} - return replacementTable \ No newline at end of file
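Finally, a hedged sketch of the templating flow this table appears designed for. The miq objects come from scoring code outside this diff, and the template file name and plain string substitution are assumptions, not the confirmed mechanism:

```python
from miqScoreShotgunPublicSupport import reporting

def renderReport(sampleMiq, goodMiq, badMiq, templatePath, outputPath):
    # generateReplacementTable maps placeholder tokens (e.g. MIQSCORE) to
    # rendered values; a simple search-and-replace fills the HTML template.
    replacements = reporting.generateReplacementTable(sampleMiq, goodMiq, badMiq)
    with open(templatePath) as template:
        html = template.read()
    for placeholder, value in replacements.items():
        html = html.replace(placeholder, str(value))
    with open(outputPath, "w") as output:
        output.write(html)
```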