Skip to content

Commit

Permalink
Merge pull request #93 from jorizci/EVA-618
Browse files Browse the repository at this point in the history
Parameterize the use of mapping files for aggregated VCF
  • Loading branch information
Cristina Yenyxe Gonzalez Garcia authored Feb 24, 2017
2 parents 918fd88 + ebc5194 commit e94df1b
Show file tree
Hide file tree
Showing 36 changed files with 357 additions and 110 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import uk.ac.ebi.eva.pipeline.io.readers.VcfReader;
import uk.ac.ebi.eva.pipeline.parameters.InputParameters;

import java.io.File;
import java.io.IOException;

import static uk.ac.ebi.eva.pipeline.configuration.BeanNames.VARIANT_READER;
Expand Down Expand Up @@ -54,11 +55,14 @@ public ItemStreamReader<Variant> unwindingReader(VcfReader vcfReader) throws Exc
public VcfReader vcfReader(InputParameters parameters) throws IOException {
String fileId = parameters.getVcfId();
String studyId = parameters.getStudyId();
File vcfFile = new File(parameters.getVcf());
VariantSource.Aggregation vcfAggregation = parameters.getVcfAggregation();

if (VariantSource.Aggregation.NONE.equals(parameters.getVcfAggregation())) {
return new VcfReader(fileId, studyId, parameters.getVcf());
if (VariantSource.Aggregation.NONE.equals(vcfAggregation)) {
return new VcfReader(fileId, studyId, vcfFile);
} else {
return new AggregatedVcfReader(fileId, studyId, parameters.getVcfAggregation(), parameters.getVcf());
return new AggregatedVcfReader(fileId, studyId, vcfAggregation, parameters.getAggregatedMappingFile(),
vcfFile);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@

import org.opencb.biodata.models.variant.VariantSource;
import org.springframework.batch.item.file.LineMapper;
import org.springframework.util.Assert;
import uk.ac.ebi.eva.commons.models.data.Variant;
import uk.ac.ebi.eva.utils.FileUtils;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

import static org.junit.Assert.assertNotNull;

Expand All @@ -34,18 +38,29 @@ public class AggregatedVcfLineMapper implements LineMapper<List<Variant>> {
private final String studyId;
private VariantVcfFactory factory;

public AggregatedVcfLineMapper(String fileId, String studyId, VariantSource.Aggregation aggregation) {
public AggregatedVcfLineMapper(String fileId, String studyId, VariantSource.Aggregation aggregation,
String mappingFilePath) throws IOException {
Assert.notNull(fileId);
Assert.notNull(studyId);
Assert.notNull(aggregation);

this.fileId = fileId;
this.studyId = studyId;

Properties mappings = null;
if(mappingFilePath!=null){
mappings = FileUtils.getPropertiesFile(mappingFilePath);
}

switch (aggregation) {
case EVS:
factory = new VariantVcfEVSFactory();
factory = new VariantVcfEVSFactory(mappings);
break;
case EXAC:
factory = new VariantVcfExacFactory();
factory = new VariantVcfExacFactory(mappings);
break;
case BASIC:
factory = new VariantAggregatedVcfFactory();
factory = new VariantAggregatedVcfFactory(mappings);
break;
case NONE:
throw new IllegalArgumentException(
Expand All @@ -57,7 +72,7 @@ public AggregatedVcfLineMapper(String fileId, String studyId, VariantSource.Aggr
@Override
public List<Variant> mapLine(String line, int lineNumber) throws Exception {
assertNotNull(this.getClass().getSimpleName() + " should be used to read aggregated VCFs only " +
"(hint: do not set VariantSource.Aggregation to NONE)", factory);
"(hint: do not set VariantSource.Aggregation to NONE)", factory);
return factory.create(fileId, studyId, line);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,29 +62,42 @@ public VariantAggregatedVcfFactory() {
}

/**
* @param tagMap Properties that contains case-sensitive tag mapping for aggregation data. A valid example structure
* of this file is:
* <pre>
* {@code
* @param mappings Properties that contains case-sensitive tag mapping for aggregation data. A valid example
* structure of this file is:
* <pre>
* {@code
*
* EUR.AF=EUR_AF
* EUR.AC=AC_EUR
* EUR.AN=EUR_AN
* EUR.GTC=EUR_GTC
* ALL.AF=AF
* ALL.AC=TAC
* ALL.AN=AN
* ALL.GTC=GTC
* }
* </pre>
* <p>
* <p>
* where the right side of the '=' is how the values appear in the vcf, and left side is how it will loaded. It must
* be a bijection, i.e. there must not be repeated entries in any side. The part before the '.' can be any string
* naming the group. The part after the '.' must be one of AF, AC, AN or GTC.
* EUR.AF=EUR_AF
* EUR.AC=AC_EUR
* EUR.AN=EUR_AN
* EUR.GTC=EUR_GTC
* ALL.AF=AF
* ALL.AC=TAC
* ALL.AN=AN
* ALL.GTC=GTC
* }
* </pre>
* <p>
* <p>
* where the right side of the '=' is how the values appear in the VCF, and the left side is how they
* will be loaded. It must be a bijection, i.e. there must not be repeated entries on either side. The
* part before the '.' can be any string naming the group. The part after the '.' must be one of AF,
* AC, AN or GTC.
*/
public VariantAggregatedVcfFactory(Properties tagMap) {
this.tagMap = tagMap;
public VariantAggregatedVcfFactory(Properties mappings) {
    // Explicitly supplied mappings always win.
    if (mappings != null) {
        loadMappings(mappings);
        return;
    }
    // Nothing supplied: defer to the default-mappings hook, which subclasses may override.
    loadDefaultMappings();
}

/**
 * Hook invoked by the constructor when no mappings are supplied. This base implementation loads
 * nothing; subclasses can override it to provide their own default mappings.
 */
protected void loadDefaultMappings() {
// No default mapping.
}

protected void loadMappings(Properties mappings) {
this.tagMap = mappings;
if (tagMap != null) {
this.reverseTagMap = new LinkedHashMap<>(tagMap.size());
for (String tag : tagMap.stringPropertyNames()) {
Expand Down Expand Up @@ -152,10 +165,11 @@ protected void parseStats(Variant variant, String fileId, String studyId, int nu
file.setStats(vs);
}

protected void parseCohortStats(Variant variant, String fileId, String studyId, int numAllele, String[] alternateAlleles,
String info) {
protected void parseCohortStats(Variant variant, String fileId, String studyId, int numAllele,
String[] alternateAlleles, String info) {
VariantSourceEntry file = variant.getSourceEntry(fileId, studyId);
Map<String, Map<String, String>> cohortStats = new LinkedHashMap<>(); // cohortName -> (statsName -> statsValue): EUR->(AC->3,2)
Map<String, Map<String, String>> cohortStats = new LinkedHashMap<>();
// cohortName -> (statsName -> statsValue): EUR->(AC->3,2)
String[] splittedInfo = info.split(";");
for (String attribute : splittedInfo) {
String[] assignment = attribute.split("=");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,20 @@
import uk.ac.ebi.eva.commons.models.data.Variant;
import uk.ac.ebi.eva.commons.models.data.VariantSourceEntry;
import uk.ac.ebi.eva.commons.models.data.VariantStats;
import uk.ac.ebi.eva.utils.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.Properties;
import java.util.Set;

/**
* Overrides the methods in VariantAggregatedVcfFactory that take care of the fields QUAL, FILTER and INFO, to support
* Overrides the methods in VariantAggregatedVcfFactory that take care of the fields QUAL, FILTER and INFO, to support
* the specific format of Exome Variant Server VCFs.
*/
public class VariantVcfEVSFactory extends VariantAggregatedVcfFactory {

private static final String EVS_MAPPING_FILE = "/mappings/evs-mapping.properties";

public VariantVcfEVSFactory() {
this(null);
Expand Down Expand Up @@ -61,10 +65,19 @@ public VariantVcfEVSFactory(Properties tagMap) {
super(tagMap);
}

/**
 * Loads the default EVS tag mappings from the resource named by {@code EVS_MAPPING_FILE}.
 *
 * @throws java.io.UncheckedIOException if the default mapping resource cannot be read
 */
@Override
protected void loadDefaultMappings() {
    try {
        loadMappings(FileUtils.getPropertiesFile(FileUtils.getResource(EVS_MAPPING_FILE).getAbsolutePath()));
    } catch (IOException e) {
        // Name the resource in the message so a missing or unreadable default file is easy to diagnose.
        throw new java.io.UncheckedIOException(
                "Could not load default EVS mappings from " + EVS_MAPPING_FILE, e);
    }
}

@Override
protected void setOtherFields(Variant variant, String fileId, String studyId, Set<String> ids, float quality, String filter,
String info, String format, int numAllele, String[] alternateAlleles, String line) {
protected void setOtherFields(Variant variant, String fileId, String studyId, Set<String> ids, float quality,
String filter, String info, String format, int numAllele, String[] alternateAlleles,
String line) {
// Fields not affected by the structure of REF and ALT fields
variant.setIds(ids);
VariantSourceEntry sourceEntry = variant.getSourceEntry(fileId, studyId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
import uk.ac.ebi.eva.commons.models.data.Variant;
import uk.ac.ebi.eva.commons.models.data.VariantSourceEntry;
import uk.ac.ebi.eva.commons.models.data.VariantStats;
import uk.ac.ebi.eva.utils.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;
Expand All @@ -43,6 +46,8 @@ public class VariantVcfExacFactory extends VariantAggregatedVcfFactory {

private static final String COMMA = ",";

private static final String EXAC_MAPPING_FILE = "/mappings/exac-mapping.properties";

public VariantVcfExacFactory() {
this(null);
}
Expand Down Expand Up @@ -72,6 +77,14 @@ public VariantVcfExacFactory(Properties tagMap) {
super(tagMap);
}

/**
 * Loads the default ExAC tag mappings from the resource named by {@code EXAC_MAPPING_FILE}.
 *
 * @throws java.io.UncheckedIOException if the default mapping resource cannot be read
 */
@Override
protected void loadDefaultMappings() {
    try {
        loadMappings(FileUtils.getPropertiesFile(FileUtils.getResource(EXAC_MAPPING_FILE).getAbsolutePath()));
    } catch (IOException e) {
        // Name the resource in the message so a missing or unreadable default file is easy to diagnose.
        throw new java.io.UncheckedIOException(
                "Could not load default ExAC mappings from " + EXAC_MAPPING_FILE, e);
    }
}

@Override
protected void parseStats(Variant variant, String fileId, String studyId, int numAllele, String[] alternateAlleles,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.io.File;
import java.io.IOException;
import java.util.Optional;

/**
* VCF file reader for VCFs without genotypes (aggregated VCFs).
Expand All @@ -31,13 +32,8 @@
*/
public class AggregatedVcfReader extends VcfReader {

public AggregatedVcfReader(String fileId, String studyId, VariantSource.Aggregation aggregation, String file)
throws IOException {
this(fileId, studyId, aggregation, new File(file));
}

public AggregatedVcfReader(String fileId, String studyId, VariantSource.Aggregation aggregation, File file)
throws IOException {
super(new AggregatedVcfLineMapper(fileId, studyId, aggregation), file);
/**
 * Creates a reader whose lines are mapped by an {@link AggregatedVcfLineMapper} built from the given
 * identifiers, aggregation type and mapping file path.
 *
 * @param fileId identifier of the file, forwarded to the line mapper
 * @param studyId identifier of the study, forwarded to the line mapper
 * @param aggregation aggregation type, forwarded to the line mapper
 * @param mappingFilePath path of the tag mapping file, forwarded to the line mapper
 * @param file VCF file handed to the parent reader
 * @throws IOException propagated from the construction of the line mapper or the parent reader
 */
public AggregatedVcfReader(String fileId, String studyId, VariantSource.Aggregation aggregation,
String mappingFilePath, File file) throws IOException {
super(new AggregatedVcfLineMapper(fileId, studyId, aggregation, mappingFilePath), file);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public class InputParameters {

private static final String PARAMETER = "#{jobParameters['";
private static final String END = "']}";
private static final String OR_NULL = "']?:null}";

@Value(PARAMETER + JobParametersNames.INPUT_STUDY_ID + END)
private String studyId;
Expand All @@ -55,13 +56,8 @@ public class InputParameters {
@Value(PARAMETER + JobParametersNames.INPUT_STUDY_TYPE + END)
private VariantStudy.StudyType studyType;

// maybe the next three could go into a ConfigurationParameters?

@Value(PARAMETER + JobParametersNames.APP_OPENCGA_PATH + END)
private String opencgaAppHome;

@Value(PARAMETER + JobParametersNames.CONFIG_RESTARTABILITY_ALLOW + "']?:false}")
private boolean allowStartIfComplete;
@Value(PARAMETER + JobParametersNames.INPUT_VCF_AGGREGATION_MAPPING_PATH + OR_NULL)
private String aggregatedMappingFile;

@Value(PARAMETER + JobParametersNames.CONFIG_CHUNK_SIZE + "']?:1000}")
private int chunkSize;
Expand Down Expand Up @@ -90,14 +86,6 @@ public VariantStudy.StudyType getStudyType() {
return studyType;
}

public String getOpencgaAppHome() {
return opencgaAppHome;
}

public boolean isAllowStartIfComplete() {
return allowStartIfComplete;
}

/**
 * @return the chunk size injected from the job parameters
 */
public int getChunkSize() {
return chunkSize;
}
Expand All @@ -109,4 +97,8 @@ public String getGtf() {
public String getPedigree() {
return pedigree;
}

/**
 * @return the path of the aggregated VCF mapping file injected from the job parameters, or
 * {@code null} when that parameter was not provided
 */
public String getAggregatedMappingFile() {
return aggregatedMappingFile;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
*/
package uk.ac.ebi.eva.pipeline.parameters;

import org.opencb.datastore.core.ObjectMap;
import org.opencb.opencga.lib.common.Config;
import org.opencb.opencga.storage.core.variant.VariantStorageManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ public class JobParametersNames {

public static final String INPUT_FASTA = "input.fasta";

public static final String INPUT_VCF_AGGREGATION_MAPPING_PATH = "input.vcf.aggregation.mapping-path";

/*
* Output
*/
Expand Down Expand Up @@ -137,6 +139,7 @@ public class JobParametersNames {

public static final String CONFIG_CHUNK_SIZE = "config.chunk.size";


public static final String PROPERTY_FILE_PROPERTY = "parameters.path";

public static final String RESTART_PROPERTY = "force.restart";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Copyright 2017 EMBL - European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.ebi.eva.pipeline.parameters.validation;

import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.JobParametersInvalidException;
import org.springframework.batch.core.JobParametersValidator;
import uk.ac.ebi.eva.pipeline.parameters.JobParametersNames;

/**
 * Checks that the aggregated mapping file exists and is readable.
 */
public class InputVcfAggregationMappingPathValidator implements JobParametersValidator {

    /**
     * Validates the path stored under {@link JobParametersNames#INPUT_VCF_AGGREGATION_MAPPING_PATH}.
     *
     * @param parameters job parameters holding the mapping file path
     * @throws JobParametersInvalidException If the file is not a valid path, does not exist or is not readable
     */
    @Override
    public void validate(JobParameters parameters) throws JobParametersInvalidException {
        final String parameterName = JobParametersNames.INPUT_VCF_AGGREGATION_MAPPING_PATH;
        final String mappingPath = parameters.getString(parameterName);

        // Existence is checked before readability, in that order.
        ParametersValidatorUtil.checkFileExists(mappingPath, parameterName);
        ParametersValidatorUtil.checkFileIsReadable(mappingPath, parameterName);
    }
}
Loading

0 comments on commit e94df1b

Please sign in to comment.