Skip to content

Commit

Permalink
Closes #1428: Fuzzy citation matching input and output transformers d…
Browse files Browse the repository at this point in the history
…o not remove output dir at startup
  • Loading branch information
marekhorst committed Aug 18, 2023
1 parent 363dfa1 commit c4ff3fd
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package eu.dnetlib.iis.wf.citationmatching.input;

import java.io.IOException;
import java.util.Collections;

import org.apache.commons.lang3.StringUtils;
Expand All @@ -17,6 +18,7 @@
import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata;
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.common.citations.schemas.Citation;
import eu.dnetlib.iis.common.java.io.HdfsUtils;
import eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal;
import pl.edu.icm.sparkutils.avro.SparkAvroLoader;
import pl.edu.icm.sparkutils.avro.SparkAvroSaver;
Expand All @@ -37,7 +39,7 @@ public class CitationMatchingInputTransformerJob {

//------------------------ LOGIC --------------------------

public static void main(String[] args) throws InterruptedException {
public static void main(String[] args) throws InterruptedException, IOException {

CitationMatchingInputTransformerJobParameters params = new CitationMatchingInputTransformerJobParameters();
JCommander jcommander = new JCommander(params);
Expand All @@ -48,6 +50,8 @@ public static void main(String[] args) throws InterruptedException {
conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator");

try (JavaSparkContext sc = new JavaSparkContext(conf)) {

HdfsUtils.remove(sc.hadoopConfiguration(), params.output);

JavaRDD<ExtractedDocumentMetadataMergedWithOriginal> inputDocuments = avroLoader.loadJavaRDD(sc, params.inputMetadata, ExtractedDocumentMetadataMergedWithOriginal.class);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package eu.dnetlib.iis.wf.citationmatching.output;

import java.io.IOException;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
Expand All @@ -9,6 +11,7 @@
import com.beust.jcommander.Parameters;

import eu.dnetlib.iis.citationmatching.schemas.Citation;
import eu.dnetlib.iis.common.java.io.HdfsUtils;
import pl.edu.icm.sparkutils.avro.SparkAvroLoader;
import pl.edu.icm.sparkutils.avro.SparkAvroSaver;

Expand All @@ -30,7 +33,7 @@ public class CitationMatchingOutputTransformerJob {

//------------------------ LOGIC --------------------------

public static void main(String[] args) {
public static void main(String[] args) throws IOException {

CitationMatchingOutputTransformerJobParameters params = new CitationMatchingOutputTransformerJobParameters();
JCommander jcommander = new JCommander(params);
Expand All @@ -41,6 +44,8 @@ public static void main(String[] args) {

try (JavaSparkContext sc = new JavaSparkContext(conf)) {

HdfsUtils.remove(sc.hadoopConfiguration(), params.output);

JavaRDD<Citation> inputCitations = avroLoader.loadJavaRDD(sc, params.input, Citation.class);


Expand Down

0 comments on commit c4ff3fd

Please sign in to comment.