Add shallow statement in report
refactor validator.py
tcezard committed Sep 1, 2024
1 parent ac54c2f commit facfd73
Showing 15 changed files with 423 additions and 268 deletions.
33 changes: 26 additions & 7 deletions eva_sub_cli/executables/trim_down.py
@@ -1,6 +1,7 @@
import argparse
import os

import yaml
from ebi_eva_common_pyutils.logger import logging_config
from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter

@@ -12,7 +13,7 @@

def trim_down_vcf(vcf_file, output_vcf):
    """
    Produce a smaller file
    Produce a smaller VCF file containing a maximum of 10000 records
    """
    with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out:
        line_count = 0
@@ -27,10 +28,13 @@ def trim_down_vcf(vcf_file, output_vcf):
                break
    if line_count != max_nb_lines:
        logger.warning(f'Only {line_count} records found in the source VCF {vcf_file}')
    return ref_seq_names
    return line_count, ref_seq_names


def trim_down_fasta(fasta_file, output_fasta, ref_seq_names):
    """
    Produce a smaller FASTA file containing only the reference sequences found in the VCF file
    """
    found_sequences = set()
    with open(output_fasta, 'w') as fasta_out:
        for header, sequence in fasta_iter(fasta_file):
@@ -47,16 +51,31 @@ def main():
    arg_parser = argparse.ArgumentParser(
        description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the '
                    f'associated reference genome')
    arg_parser.add_argument('--vcf_file', dest='vcf_file', help='Path to the vcf file to be trimmed down')
    arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', help='Path to the output vcf file')
    arg_parser.add_argument('--fasta_file', dest='fasta_file', help='Path to the fasta file to be trimmed down')
    arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', help='Path to the output fasta file')
    arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True,
                            help='Path to the vcf file to be trimmed down')
    arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True,
                            help='Path to the output vcf file')
    arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True,
                            help='Path to the fasta file to be trimmed down')
    arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True,
                            help='Path to the output fasta file')
    arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True,
                            help='Path to the yaml file containing the trim down metrics')

    args = arg_parser.parse_args()
    logging_config.add_stdout_handler()

    ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
    line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
    sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence)
    trim_down_metrics = {'trim_down_vcf_record': line_count,
                         'number_sequence_found': len(sequence_found),  # store a count: yaml.safe_dump cannot serialise a set
                         'trim_down_required': line_count == max_nb_lines}
    if len(sequence_found) != len(ref_sequence):
        logger.warning('Not all sequences were found in the FASTA file. Cancelling trimming down of the FASTA file')
        os.remove(args.output_fasta_file)  # os.link fails if the destination already exists
        os.link(args.fasta_file, args.output_fasta_file)
        trim_down_metrics.pop('number_sequence_found')
    with open(args.output_yaml_file, 'w') as open_file:  # open for writing, not reading
        yaml.safe_dump(trim_down_metrics, open_file)




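For reference, the metrics file written at the end of `main()` is a small flat YAML mapping. A minimal sketch of reading it back with `yaml.safe_load`; the file name here is hypothetical (the Nextflow process below derives the real name from the VCF base name):

import yaml

# Hypothetical file name; in the pipeline it is <vcf_basename>_trim_down.yml.
with open('input_trim_down.yml') as metrics_file:
    metrics = yaml.safe_load(metrics_file)

# Keys written by trim_down.py:
#   trim_down_vcf_record   - number of records kept in the trimmed VCF
#   number_sequence_found  - number of reference sequences kept in the FASTA
#                            (absent when the FASTA trim down was cancelled)
#   trim_down_required     - True when the source VCF held at least max_nb_lines records
print(metrics['trim_down_vcf_record'], metrics['trim_down_required'])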
9 changes: 9 additions & 0 deletions eva_sub_cli/file_utils.py
@@ -1,9 +1,18 @@
import glob
import gzip
import os
import shutil
from itertools import groupby


def resolve_single_file_path(file_path):
    files = glob.glob(file_path)
    if len(files) == 0:
        return None
    elif len(files) > 0:
        return files[0]


def is_submission_dir_writable(submission_dir):
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
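A quick usage note for the new `resolve_single_file_path` helper: it globs a pattern and returns the first match, or `None` when nothing matches. A small sketch (the pattern below is only an example):

from eva_sub_cli.file_utils import resolve_single_file_path

# Returns the first file matching the glob pattern, or None when there is no match.
metrics_file = resolve_single_file_path('/path/to/validation_output/*_trim_down.yml')
if metrics_file is None:
    print('No trim down metrics found: shallow validation did not run')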
5 changes: 5 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
@@ -4,6 +4,7 @@
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}
{% from 'shallow_validation.html' import shallow_validation_report %}

<html lang="EN">
<head>
@@ -46,6 +47,10 @@ <h6>eva-sub-cli v{{cli_version}}</h6>
</div>
</header>

<section>
{{ shallow_validation_report(validation_results) }}
</section>

<section>
<h2>Project Summary</h2>
<div class="description">
2 changes: 2 additions & 0 deletions eva_sub_cli/jinja_templates/project_details.html
@@ -32,4 +32,6 @@
</div>
{% endif %}



{%- endmacro %}
27 changes: 27 additions & 0 deletions eva_sub_cli/jinja_templates/shallow_validation.html
@@ -0,0 +1,27 @@

{% macro shallow_validation_report(validation_results) -%}
{% set results = validation_results.get('shallow_validation', {}) %}

{% if results.get('required') %}
<div class="report-section fail collapsible"> <span class="expand_icon">&#9654;</span>
&#10060; <b>You requested to run the shallow validation; please run the full validation before submitting the data</b>
</div>
<div class="no-show">
<table>
<tr>
<th><strong>VCF File</strong></th>
<th><strong>Records validated in VCF</strong></th>
<th><strong>Sequences validated in FASTA</strong></th>
</tr>
{% for vcf_file in results.get('metrics') %}
<tr>
<td>{{ vcf_file }}</td>
<td>{{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }}</td>
<td>{{ results.get('metrics').get(vcf_file).get('number_sequence_found') }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}

{%- endmacro %}
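The macro above expects `validation_results['shallow_validation']` to provide a `required` flag and per-VCF `metrics`. A hypothetical Python structure matching what the template reads (values are made up):

validation_results = {
    'shallow_validation': {
        'required': True,  # True when validation ran on trimmed-down files
        'metrics': {
            'analysis1.vcf.gz': {
                'trim_down_vcf_record': 10000,
                'number_sequence_found': 24,
            },
        },
    },
}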
12 changes: 7 additions & 5 deletions eva_sub_cli/nextflow/validation.nf
@@ -105,19 +105,21 @@ workflow {


process trim_down_vcf {
    publishDir output_dir,
        overwrite: false,
        mode: "copy",
        pattern: "*.log"
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log"
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml"

    input:
    tuple path(vcf), path(fasta), path(report)

    output:
    tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref
    path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log
    path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric

    """
    mkdir output
    $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta > trim_down.log
    $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log
    # This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps
    touch $report
    """

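Each VCF now publishes a `<vcf_basename>_trim_down.yml` metrics file alongside its log. Downstream, these need to be folded into the `shallow_validation` section consumed by the report; a rough sketch of that aggregation, assuming the function name and directory layout (neither is shown in this commit):

import glob
import os

import yaml


def collect_trim_down_metrics(output_dir):
    """Aggregate per-VCF trim down metrics keyed by VCF base name."""
    metrics = {}
    for yml_path in glob.glob(os.path.join(output_dir, '*_trim_down.yml')):
        vcf_name = os.path.basename(yml_path)[:-len('_trim_down.yml')]
        with open(yml_path) as open_file:
            metrics[vcf_name] = yaml.safe_load(open_file)
    return metrics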
5 changes: 3 additions & 2 deletions eva_sub_cli/report.py
@@ -14,7 +14,8 @@ def get_logo_data():
    return logo_data


def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping, project_title=None):
def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping,
                         project_title=None):
    vcf_files = sorted(set([file_name
                            for check in validation_results if check in ["vcf_check", "assembly_check"]
                            for file_name in validation_results[check]
@@ -32,7 +33,7 @@ def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping,
        fasta_files=fasta_files,
        submission_dir=submission_dir,
        vcf_fasta_analysis_mapping=vcf_fasta_analysis_mapping,
        validation_results=validation_results,
        validation_results=validation_results
    )

    try:
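For context, a hedged sketch of calling the reformatted signature; every argument value below is a placeholder, not something this commit defines:

from eva_sub_cli.report import generate_html_report

# Placeholder arguments: real validation_results come from the validators and
# include the new 'shallow_validation' section rendered at the top of the report.
html = generate_html_report(
    validation_results={'shallow_validation': {'required': False}},
    validation_date='2024-09-01',
    submission_dir='/path/to/submission',
    vcf_fasta_analysis_mapping=[],
    project_title='My project',
)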
191 changes: 191 additions & 0 deletions eva_sub_cli/validators/validation_results_parsers.py
@@ -0,0 +1,191 @@
import re

from ebi_eva_common_pyutils.logger import logging_config

logger = logging_config.get_logger(__name__)

def parse_assembly_check_log(assembly_check_log):
    error_list = []
    nb_error, nb_mismatch = 0, 0
    match = total = None
    with open(assembly_check_log) as open_file:
        for line in open_file:
            if line.startswith('[error]'):
                nb_error += 1
                if nb_error < 11:
                    error_list.append(line.strip()[len('[error] '):])
            elif line.startswith('[info] Number of matches:'):
                match, total = line.strip()[len('[info] Number of matches: '):].split('/')
                match = int(match)
                total = int(total)
    return error_list, nb_error, match, total


def parse_assembly_check_report(assembly_check_report):
    mismatch_list = []
    nb_mismatch = 0
    nb_error = 0
    error_list = []
    with open(assembly_check_report) as open_file:
        for line in open_file:
            if 'does not match the reference sequence' in line:
                nb_mismatch += 1
                if nb_mismatch < 11:
                    mismatch_list.append(line.strip())
            elif 'Multiple synonyms' in line:
                nb_error += 1
                if nb_error < 11:
                    error_list.append(line.strip())
            # Contig not found in FASTA is reported here rather than in logs when no assembly report is used.
            # Count and report once per contig name rather than once per line, to avoid redundant errors.
            elif 'is not present in FASTA file' in line:
                line_num, error_msg = line.split(': ')
                error_msg = error_msg.strip()
                if error_msg not in error_list:
                    nb_error += 1
                    if nb_error < 11:
                        error_list.append(error_msg)
    return mismatch_list, nb_mismatch, error_list, nb_error


def parse_vcf_check_report(vcf_check_report):
    valid = True
    max_error_reported = 10
    error_list, critical_list = [], []
    warning_count = error_count = critical_count = 0
    with open(vcf_check_report) as open_file:
        for line in open_file:
            if 'warning' in line:
                warning_count += 1
            elif line.startswith('According to the VCF specification'):
                if 'not' in line:
                    valid = False
            elif vcf_check_errors_is_critical(line.strip()):
                critical_count += 1
                if critical_count <= max_error_reported:
                    critical_list.append(line.strip())
            else:
                error_count += 1
                if error_count <= max_error_reported:
                    error_list.append(line.strip())

    return valid, warning_count, error_count, critical_count, error_list, critical_list


def vcf_check_errors_is_critical(error):
    """
    This function identifies VCF check errors that are not critical for the processing of the VCF within EVA.
    They affect specific INFO or FORMAT fields that are used in the variant detection but less so in the downstream
    analysis.
    Critical:
    Reference and alternate alleles must not be the same.
    Requested evidence presence with --require-evidence. Please provide genotypes (GT field in FORMAT and samples),
    or allele frequencies (AF field in INFO), or allele counts (AC and AN fields in INFO)..
    Contig is not sorted by position. Contig chr10 position 41695506 found after 41883113.
    Duplicated variant chr1A:1106203:A>G found.
    Metadata description string is not valid.
    Non-critical:
    Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive
    its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2
    (derived from ploidy 1).
    Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..
    """
    non_critical_format_fields = ['PL', 'AD', 'AC']
    non_critical_info_fields = ['AC']
    regexes = {
        # INFO errors check the non-critical INFO list; per-sample (FORMAT) errors check the non-critical FORMAT list
        r'^INFO (\w+) does not match the specification Number': non_critical_info_fields,
        r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_format_fields
    }
    for regex in regexes:
        match = re.match(regex, error)
        if match:
            field_affected = match.group(1)
            if field_affected in regexes[regex]:
                return False
    return True


def parse_biovalidator_validation_results(metadata_check_file):
    """
    Read the biovalidator's report and extract the list of validation errors
    """
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

    def clean_read(ifile):
        l = ifile.readline()
        if l:
            return ansi_escape.sub('', l).strip()

    if not metadata_check_file:
        return

    errors = []

    with open(metadata_check_file) as open_file:
        collect = False
        while True:
            line = clean_read(open_file)
            if line is None:
                break  # EOF
            elif not line:
                continue  # Empty line
            if not collect:
                if line.startswith('Validation failed with following error(s):'):
                    collect = True
            else:
                line2 = clean_read(open_file)
                if line is None or line2 is None:
                    break  # EOF
                errors.append({'property': line, 'description': line2})
    return errors


def convert_metadata_sheet(json_attribute, xls2json_conf):
    if json_attribute is None:
        return None
    for sheet_name in xls2json_conf['worksheets']:
        if xls2json_conf['worksheets'][sheet_name] == json_attribute:
            return sheet_name


def convert_metadata_row(sheet, json_row, xls2json_conf):
    if json_row is None:
        return ''
    if 'header_row' in xls2json_conf[sheet]:
        return int(json_row) + xls2json_conf[sheet]['header_row']
    else:
        return int(json_row) + 2


def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
    if json_attribute is None:
        return ''
    attributes_dict = {}
    attributes_dict.update(xls2json_conf[sheet].get('required', {}))
    attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
    for attribute in attributes_dict:
        if attributes_dict[attribute] == json_attribute:
            return attribute


def parse_metadata_property(property_str):
    if property_str.startswith('.'):
        return property_str.strip('./'), None, None
    # First attempt to parse as BioSample object
    sheet, row, col = parse_sample_metadata_property(property_str)
    if sheet is not None and row is not None and col is not None:
        return sheet, row, col
    match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str)
    if match:
        return match.group(1), match.group(3), match.group(5)
    else:
        logger.error(f'Cannot parse {property_str} in JSON metadata error')
        return None, None, None


def parse_sample_metadata_property(property_str):
    match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
    if match:
        return 'sample', match.group(1), match.group(2)
    return None, None, None
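A few illustrative calls against the new parsers, using inputs lifted from the docstrings above; a sketch to show the expected return shapes, not a test suite:

from eva_sub_cli.validators.validation_results_parsers import (
    parse_sample_metadata_property, vcf_check_errors_is_critical
)

# FORMAT-field mismatches listed under "Non-critical" in the docstring are not critical
assert vcf_check_errors_is_critical(
    'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..'
) is False
# Anything unrecognised, e.g. a duplicated variant, stays critical
assert vcf_check_errors_is_critical('Duplicated variant chr1A:1106203:A>G found.') is True

# BioSample-style metadata properties resolve to (sheet, row, attribute)
assert parse_sample_metadata_property(
    '/sample/3/bioSampleObject/characteristics/organism'
) == ('sample', '3', 'organism')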