Add shallow statement in report
refactor validator.py
tcezard committed Sep 1, 2024
1 parent ac54c2f commit facfd73
Showing 15 changed files with 423 additions and 268 deletions.
33 changes: 26 additions & 7 deletions eva_sub_cli/executables/trim_down.py
@@ -1,6 +1,7 @@
import argparse
import os

import yaml
from ebi_eva_common_pyutils.logger import logging_config
from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter

@@ -12,7 +13,7 @@

def trim_down_vcf(vcf_file, output_vcf):
    """
    Produce a smaller file
    Produce a smaller VCF file containing a maximum of 10000 records
    """
    with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out:
        line_count = 0
@@ -27,10 +28,13 @@ def trim_down_vcf(vcf_file, output_vcf):
                break
    if line_count != max_nb_lines:
        logger.warning(f'Only {line_count} records found in the source VCF {vcf_file}')
    return ref_seq_names
    return line_count, ref_seq_names


def trim_down_fasta(fasta_file, output_fasta, ref_seq_names):
    """
    Produce a smaller FASTA file containing only the reference sequences found in the VCF file
    """
    found_sequences = set()
    with open(output_fasta, 'w') as fasta_out:
        for header, sequence in fasta_iter(fasta_file):
@@ -47,16 +51,31 @@ def main():
    arg_parser = argparse.ArgumentParser(
        description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the '
                    f'associated reference genome')
    arg_parser.add_argument('--vcf_file', dest='vcf_file', help='Path to the vcf file to be trimmed down')
    arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', help='Path to the output vcf file')
    arg_parser.add_argument('--fasta_file', dest='fasta_file', help='Path to the fasta file to be trimmed down')
    arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', help='Path to the output fasta file')
    arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True,
                            help='Path to the vcf file to be trimmed down')
    arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True,
                            help='Path to the output vcf file')
    arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True,
                            help='Path to the fasta file to be trimmed down')
    arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True,
                            help='Path to the output fasta file')
    arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True,
                            help='Path to the yaml file containing the trim down metrics')

    args = arg_parser.parse_args()
    logging_config.add_stdout_handler()

    ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
    line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
    sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence)
    trim_down_metrics = {'trim_down_vcf_record': line_count,
                         'number_sequence_found': len(sequence_found),  # store a count: yaml.safe_dump cannot serialise a set
                         'trim_down_required': line_count == max_nb_lines}
    if len(sequence_found) != len(ref_sequence):
        logger.warning('Not all sequences were found in the FASTA file. Cancelling trimming down of the FASTA file')
        os.remove(args.output_fasta_file)  # os.link fails if the destination already exists
        os.link(args.fasta_file, args.output_fasta_file)
        trim_down_metrics.pop('number_sequence_found')
    with open(args.output_yaml_file, 'w') as open_file:  # open for writing, not reading
        yaml.safe_dump(trim_down_metrics, open_file)




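For reference, the metrics file written at the end of `main()` is a small flat YAML mapping. A minimal sketch of reading it back with `yaml.safe_load`; the file name here is hypothetical (the Nextflow process below derives the real name from the VCF base name):

import yaml

# Hypothetical file name; in the pipeline it is <vcf_basename>_trim_down.yml.
with open('input_trim_down.yml') as metrics_file:
    metrics = yaml.safe_load(metrics_file)

# Keys written by trim_down.py:
#   trim_down_vcf_record   - number of records kept in the trimmed VCF
#   number_sequence_found  - number of reference sequences kept in the FASTA
#                            (absent when the FASTA trim down was cancelled)
#   trim_down_required     - True when the source VCF held at least max_nb_lines records
print(metrics['trim_down_vcf_record'], metrics['trim_down_required'])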
9 changes: 9 additions & 0 deletions eva_sub_cli/file_utils.py
@@ -1,9 +1,18 @@
import glob
import gzip
import os
import shutil
from itertools import groupby


def resolve_single_file_path(file_path):
    files = glob.glob(file_path)
    if len(files) == 0:
        return None
    elif len(files) > 0:
        return files[0]


def is_submission_dir_writable(submission_dir):
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
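A quick usage note for the new `resolve_single_file_path` helper: it globs a pattern and returns the first match, or `None` when nothing matches. A small sketch (the pattern below is only an example):

from eva_sub_cli.file_utils import resolve_single_file_path

# Returns the first file matching the glob pattern, or None when there is no match.
metrics_file = resolve_single_file_path('/path/to/validation_output/*_trim_down.yml')
if metrics_file is None:
    print('No trim down metrics found: shallow validation did not run')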
5 changes: 5 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
@@ -4,6 +4,7 @@
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}
{% from 'shallow_validation.html' import shallow_validation_report %}

<html lang="EN">
<head>
@@ -46,6 +47,10 @@ <h6>eva-sub-cli v{{cli_version}}</h6>
</div>
</header>

<section>
{{ shallow_validation_report(validation_results) }}
</section>

<section>
<h2>Project Summary</h2>
<div class="description">
2 changes: 2 additions & 0 deletions eva_sub_cli/jinja_templates/project_details.html
@@ -32,4 +32,6 @@
</div>
{% endif %}



{%- endmacro %}
27 changes: 27 additions & 0 deletions eva_sub_cli/jinja_templates/shallow_validation.html
@@ -0,0 +1,27 @@

{% macro shallow_validation_report(validation_results) -%}
{% set results = validation_results.get('shallow_validation', {}) %}

{% if results.get('required') %}
<div class="report-section fail collapsible"> <span class="expand_icon">&#9654;</span>
&#10060; <b>You requested to run the shallow validation; please run the full validation before submitting the data</b>
</div>
<div class="no-show">
<table>
<tr>
<th><strong>VCF File</strong></th>
<th><strong>Records validated in VCF</strong></th>
<th><strong>Sequences validated in FASTA</strong></th>
</tr>
{% for vcf_file in results.get('metrics') %}
<tr>
<td>{{ vcf_file }}</td>
<td>{{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }}</td>
<td>{{ results.get('metrics').get(vcf_file).get('number_sequence_found') }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}

{%- endmacro %}
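The macro above expects `validation_results['shallow_validation']` to provide a `required` flag and per-VCF `metrics`. A hypothetical Python structure matching what the template reads (values are made up):

validation_results = {
    'shallow_validation': {
        'required': True,  # True when validation ran on trimmed-down files
        'metrics': {
            'analysis1.vcf.gz': {
                'trim_down_vcf_record': 10000,
                'number_sequence_found': 24,
            },
        },
    },
}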
12 changes: 7 additions & 5 deletions eva_sub_cli/nextflow/validation.nf
@@ -105,19 +105,21 @@ workflow {


process trim_down_vcf {
    publishDir output_dir,
        overwrite: false,
        mode: "copy",
        pattern: "*.log"
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log"
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml"

    input:
    tuple path(vcf), path(fasta), path(report)

    output:
    tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref
    path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log
    path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric

    """
    mkdir output
    $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta > trim_down.log
    $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log
    # This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps
    touch $report
    """

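Each VCF now publishes a `<vcf_basename>_trim_down.yml` metrics file alongside its log. Downstream, these need to be folded into the `shallow_validation` section consumed by the report; a rough sketch of that aggregation, assuming the function name and directory layout (neither is shown in this commit):

import glob
import os

import yaml


def collect_trim_down_metrics(output_dir):
    """Aggregate per-VCF trim down metrics keyed by VCF base name."""
    metrics = {}
    for yml_path in glob.glob(os.path.join(output_dir, '*_trim_down.yml')):
        vcf_name = os.path.basename(yml_path)[:-len('_trim_down.yml')]
        with open(yml_path) as open_file:
            metrics[vcf_name] = yaml.safe_load(open_file)
    return metrics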
5 changes: 3 additions & 2 deletions eva_sub_cli/report.py
@@ -14,7 +14,8 @@ def get_logo_data():
    return logo_data


def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping, project_title=None):
def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping,
                         project_title=None):
    vcf_files = sorted(set([file_name
                            for check in validation_results if check in ["vcf_check", "assembly_check"]
                            for file_name in validation_results[check]
@@ -32,7 +33,7 @@ def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping,
        fasta_files=fasta_files,
        submission_dir=submission_dir,
        vcf_fasta_analysis_mapping=vcf_fasta_analysis_mapping,
        validation_results=validation_results,
        validation_results=validation_results
    )

    try:
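For context, a hedged sketch of calling the reformatted signature; every argument value below is a placeholder, not something this commit defines:

from eva_sub_cli.report import generate_html_report

# Placeholder arguments: real validation_results come from the validators and
# include the new 'shallow_validation' section rendered at the top of the report.
html = generate_html_report(
    validation_results={'shallow_validation': {'required': False}},
    validation_date='2024-09-01',
    submission_dir='/path/to/submission',
    vcf_fasta_analysis_mapping=[],
    project_title='My project',
)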
191 changes: 191 additions & 0 deletions eva_sub_cli/validators/validation_results_parsers.py
@@ -0,0 +1,191 @@
import re

from ebi_eva_common_pyutils.logger import logging_config

logger = logging_config.get_logger(__name__)

def parse_assembly_check_log(assembly_check_log):
    error_list = []
    nb_error, nb_mismatch = 0, 0
    match = total = None
    with open(assembly_check_log) as open_file:
        for line in open_file:
            if line.startswith('[error]'):
                nb_error += 1
                if nb_error < 11:
                    error_list.append(line.strip()[len('[error] '):])
            elif line.startswith('[info] Number of matches:'):
                match, total = line.strip()[len('[info] Number of matches: '):].split('/')
                match = int(match)
                total = int(total)
    return error_list, nb_error, match, total


def parse_assembly_check_report(assembly_check_report):
    mismatch_list = []
    nb_mismatch = 0
    nb_error = 0
    error_list = []
    with open(assembly_check_report) as open_file:
        for line in open_file:
            if 'does not match the reference sequence' in line:
                nb_mismatch += 1
                if nb_mismatch < 11:
                    mismatch_list.append(line.strip())
            elif 'Multiple synonyms' in line:
                nb_error += 1
                if nb_error < 11:
                    error_list.append(line.strip())
            # Contig not found in FASTA is reported here rather than in logs when no assembly report is used.
            # Count and report once per contig name rather than once per line, to avoid redundant errors.
            elif 'is not present in FASTA file' in line:
                line_num, error_msg = line.split(': ')
                error_msg = error_msg.strip()
                if error_msg not in error_list:
                    nb_error += 1
                    if nb_error < 11:
                        error_list.append(error_msg)
    return mismatch_list, nb_mismatch, error_list, nb_error


def parse_vcf_check_report(vcf_check_report):
    valid = True
    max_error_reported = 10
    error_list, critical_list = [], []
    warning_count = error_count = critical_count = 0
    with open(vcf_check_report) as open_file:
        for line in open_file:
            if 'warning' in line:
                warning_count += 1
            elif line.startswith('According to the VCF specification'):
                if 'not' in line:
                    valid = False
            elif vcf_check_errors_is_critical(line.strip()):
                critical_count += 1
                if critical_count <= max_error_reported:
                    critical_list.append(line.strip())
            else:
                error_count += 1
                if error_count <= max_error_reported:
                    error_list.append(line.strip())

    return valid, warning_count, error_count, critical_count, error_list, critical_list


def vcf_check_errors_is_critical(error):
    """
    This function identifies VCF check errors that are not critical for the processing of the VCF within EVA.
    They affect specific INFO or FORMAT fields that are used in the variant detection but less so in the downstream
    analysis.
    Critical:
    Reference and alternate alleles must not be the same.
    Requested evidence presence with --require-evidence. Please provide genotypes (GT field in FORMAT and samples),
    or allele frequencies (AF field in INFO), or allele counts (AC and AN fields in INFO)..
    Contig is not sorted by position. Contig chr10 position 41695506 found after 41883113.
    Duplicated variant chr1A:1106203:A>G found.
    Metadata description string is not valid.
    Non-critical:
    Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive
    its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2
    (derived from ploidy 1).
    Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..
    """
    non_critical_format_fields = ['PL', 'AD', 'AC']
    non_critical_info_fields = ['AC']
    regexes = {
        # INFO errors check the non-critical INFO list; per-sample (FORMAT) errors check the non-critical FORMAT list
        r'^INFO (\w+) does not match the specification Number': non_critical_info_fields,
        r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_format_fields
    }
    for regex in regexes:
        match = re.match(regex, error)
        if match:
            field_affected = match.group(1)
            if field_affected in regexes[regex]:
                return False
    return True


def parse_biovalidator_validation_results(metadata_check_file):
    """
    Read the biovalidator's report and extract the list of validation errors
    """
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

    def clean_read(ifile):
        l = ifile.readline()
        if l:
            return ansi_escape.sub('', l).strip()

    if not metadata_check_file:
        return

    errors = []

    with open(metadata_check_file) as open_file:
        collect = False
        while True:
            line = clean_read(open_file)
            if line is None:
                break  # EOF
            elif not line:
                continue  # Empty line
            if not collect:
                if line.startswith('Validation failed with following error(s):'):
                    collect = True
            else:
                line2 = clean_read(open_file)
                if line is None or line2 is None:
                    break  # EOF
                errors.append({'property': line, 'description': line2})
    return errors


def convert_metadata_sheet(json_attribute, xls2json_conf):
    if json_attribute is None:
        return None
    for sheet_name in xls2json_conf['worksheets']:
        if xls2json_conf['worksheets'][sheet_name] == json_attribute:
            return sheet_name


def convert_metadata_row(sheet, json_row, xls2json_conf):
    if json_row is None:
        return ''
    if 'header_row' in xls2json_conf[sheet]:
        return int(json_row) + xls2json_conf[sheet]['header_row']
    else:
        return int(json_row) + 2


def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
    if json_attribute is None:
        return ''
    attributes_dict = {}
    attributes_dict.update(xls2json_conf[sheet].get('required', {}))
    attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
    for attribute in attributes_dict:
        if attributes_dict[attribute] == json_attribute:
            return attribute


def parse_metadata_property(property_str):
    if property_str.startswith('.'):
        return property_str.strip('./'), None, None
    # First attempt to parse as BioSample object
    sheet, row, col = parse_sample_metadata_property(property_str)
    if sheet is not None and row is not None and col is not None:
        return sheet, row, col
    match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str)
    if match:
        return match.group(1), match.group(3), match.group(5)
    else:
        logger.error(f'Cannot parse {property_str} in JSON metadata error')
        return None, None, None


def parse_sample_metadata_property(property_str):
    match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
    if match:
        return 'sample', match.group(1), match.group(2)
    return None, None, None
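A few illustrative calls against the new parsers, using inputs lifted from the docstrings above; a sketch to show the expected return shapes, not a test suite:

from eva_sub_cli.validators.validation_results_parsers import (
    parse_sample_metadata_property, vcf_check_errors_is_critical
)

# FORMAT-field mismatches listed under "Non-critical" in the docstring are not critical
assert vcf_check_errors_is_critical(
    'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..'
) is False
# Anything unrecognised, e.g. a duplicated variant, stays critical
assert vcf_check_errors_is_critical('Duplicated variant chr1A:1106203:A>G found.') is True

# BioSample-style metadata properties resolve to (sheet, row, attribute)
assert parse_sample_metadata_property(
    '/sample/3/bioSampleObject/characteristics/organism'
) == ('sample', '3', 'organism')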