From 3ab4ba620c5a4cb09743a218a544901e8d78ae7e Mon Sep 17 00:00:00 2001
From: tcezard
Date: Thu, 28 Sep 2023 21:38:45 +0100
Subject: [PATCH] Generate tsv files at the end of the counting process

---
 .../gather_release_counts.py                 | 52 +++++++++++++++++--
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py b/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py
index cea8e57cc..c068bd310 100644
--- a/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py
+++ b/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py
@@ -1,9 +1,12 @@
 import argparse
 import glob
 import os
-from collections import defaultdict
+from collections import defaultdict, Counter
 
 from ebi_eva_common_pyutils.command_utils import run_command_with_output
+from ebi_eva_common_pyutils.logger import logging_config
+
+logger = logging_config.get_logger(__name__)
 
 shell_script_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bash')
 
@@ -38,11 +41,15 @@ def find_link(key_set, dict1, dict2, source_linked_set1=None, source_linked_set2
 def gather_assemblies_and_species_from_directories(release_directory):
     all_assemblies_2_species = defaultdict(list)
     all_species_2_assemblies = defaultdict(list)
-    for assembly_dir in glob.glob(os.path.join(release_directory, '*', "GCA_*")): 
+    for assembly_dir in glob.glob(os.path.join(release_directory, '*', "GCA_*")):
         assembly_accession = os.path.basename(assembly_dir)
         species_name = os.path.basename(os.path.dirname(assembly_dir))
         all_assemblies_2_species[assembly_accession].append(species_name)
         all_species_2_assemblies[species_name].append(assembly_accession)
+    for unmapped_file in glob.glob(os.path.join(release_directory, '*', "*unmapped_ids.txt.gz")):
+        species_name = os.path.basename(os.path.dirname(unmapped_file))
+        if species_name not in all_species_2_assemblies:
+            all_species_2_assemblies[species_name] = []
     return all_assemblies_2_species, all_species_2_assemblies
 
 
@@ -76,10 +83,35 @@ def collect_files_to_count(release_directory, set_of_species):
         txt_files = [f for f in txt_files if 'dbsnp_' not in f and 'eva_' not in f]
         all_files.extend(vcf_files)
         all_files.extend(txt_files)
-        all_files.extend(glob.glob(os.path.join(species_dir,'*_unmapped_ids.txt.gz')))
+        all_files.extend(glob.glob(os.path.join(species_dir, '*_unmapped_ids.txt.gz')))
     return all_files
 
 
+def parse_logs(all_logs):
+    species_counts = defaultdict(Counter)
+    assembly_counts = defaultdict(Counter)
+    species_assembly_counts = defaultdict(Counter)
+    for logs in all_logs:
+        with open(logs) as open_file:
+            for line in open_file:
+                sp_line = line.strip().split()
+                count = int(sp_line[0])
+                set_of_annotations = sp_line[1].split(',')[:-1]
+                for annotation in set_of_annotations:
+                    assembly, sc_name, idtype = annotation.split('-')
+                    species_counts[sc_name][idtype] += count
+                    assembly_counts[assembly][idtype] += count
+                    species_assembly_counts[sc_name + '\t' + assembly][idtype] += count
+    return species_counts, assembly_counts, species_assembly_counts
+
+
+def generate_output_tsv(dict_of_counter, output_file):
+    with open(output_file, 'w') as open_file:
+        for annotation1 in dict_of_counter:
+            for annotation2 in dict_of_counter[annotation1]:
+                open_file.write("\t".join([annotation1, annotation2, str(dict_of_counter[annotation1][annotation2])]) + '\n')
+
+
 def main():
     parser = argparse.ArgumentParser(
         description='Parse all the release output to get RS statistics per species')
@@ -94,18 +126,28 @@ def main():
                                      "Process all if missing")
     args = parser.parse_args()
+
+    logging_config.add_stdout_handler()
+    logger.info(f'Analyse {args.release_root_path}')
     all_assemblies_2_species, all_species_2_assemblies = gather_assemblies_and_species_from_directories(args.release_root_path)
 
     all_sets_of_species = set()
     # Determine the species that needs to be counted together because they share assemblies
     species_to_search = args.species_directories
     if not species_to_search:
         species_to_search = all_species_2_assemblies.keys()
+    logger.info(f'Process {len(species_to_search)} species')
     for species in species_to_search:
         set_of_species, set_of_assemblies = find_link({species}, all_species_2_assemblies, all_assemblies_2_species)
         all_sets_of_species.add(set_of_species)
-
+    logger.info(f'Aggregate species in {len(all_sets_of_species)} groups')
+    all_logs = []
     for set_of_species in all_sets_of_species:
-        count_log = gather_count_for_set_species(args.release_root_path, set_of_species, args.output_dir)
+        logger.info(f'Process files for {",".join(set_of_species)} group')
+        all_logs.append(gather_count_for_set_species(args.release_root_path, set_of_species, args.output_dir))
+    species_counts, assembly_counts, species_assembly_counts = parse_logs(all_logs)
+    generate_output_tsv(species_counts, os.path.join(args.output_dir, 'species_counts.tsv'))
+    generate_output_tsv(assembly_counts, os.path.join(args.output_dir, 'assembly_counts.tsv'))
+    generate_output_tsv(species_assembly_counts, os.path.join(args.output_dir, 'species_assembly_counts.tsv'))
 
 
 if __name__ == "__main__":
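
Note on the expected log format: the layout of the per-group count logs returned by gather_count_for_set_species is produced by the bash counting step and is not shown in this diff. The sketch below is a minimal, self-contained illustration of how the new parse_logs and generate_output_tsv functions fit together; the log line, the GCA_000001405.15 accession, the homo_sapiens species name and the current_ids/merged_ids id types are hypothetical values shaped to match what the parsing code expects, not values taken from a real release.

    import os
    import tempfile

    # Assumes gather_release_counts.py is importable (e.g. run from its directory).
    from gather_release_counts import parse_logs, generate_output_tsv

    # Hypothetical log line in the shape parse_logs() consumes:
    # "<count> <assembly>-<species>-<idtype>,<assembly>-<species>-<idtype>,"
    # The trailing comma matters: parse_logs drops the last element of split(',').
    log_line = '42 GCA_000001405.15-homo_sapiens-current_ids,GCA_000001405.15-homo_sapiens-merged_ids,\n'

    with tempfile.TemporaryDirectory() as tmp_dir:
        count_log = os.path.join(tmp_dir, 'count_for_homo_sapiens.log')
        with open(count_log, 'w') as open_file:
            open_file.write(log_line)

        species_counts, assembly_counts, species_assembly_counts = parse_logs([count_log])
        # species_counts  == {'homo_sapiens': Counter({'current_ids': 42, 'merged_ids': 42})}
        # assembly_counts == {'GCA_000001405.15': Counter({'current_ids': 42, 'merged_ids': 42})}

        generate_output_tsv(species_counts, os.path.join(tmp_dir, 'species_counts.tsv'))
        # species_counts.tsv now holds one row per (species, id type) pair:
        # homo_sapiens<TAB>current_ids<TAB>42
        # homo_sapiens<TAB>merged_ids<TAB>42

Because species_assembly_counts is keyed on 'species\tassembly', the same generate_output_tsv helper yields a four-column file for the per-species-per-assembly breakdown without any special casing.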