From 5a4938afdfa512c755c9a665712d7c6691fd13bb Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 6 Nov 2024 23:00:59 +0200 Subject: [PATCH] Refactor cmd-arguments-handling. --- .../PublicationsRetriever.java | 244 ++-------------- .../crawler/PageCrawler.java | 4 +- .../util/args/ArgsUtils.java | 264 ++++++++++++++++++ .../util/file/FileUtils.java | 5 +- .../util/http/ConnSupportUtils.java | 5 +- .../util/http/HttpConnUtils.java | 6 +- .../util/url/DataToBeLogged.java | 4 +- .../util/url/LoaderAndChecker.java | 9 +- .../test/TestNonStandardInputOutput.java | 39 +-- 9 files changed, 324 insertions(+), 256 deletions(-) create mode 100644 src/main/java/eu/openaire/publications_retriever/util/args/ArgsUtils.java diff --git a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java index 55ac740..b1f8286 100644 --- a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java +++ b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java @@ -3,6 +3,7 @@ import eu.openaire.publications_retriever.crawler.MachineLearning; import eu.openaire.publications_retriever.crawler.MetadataHandler; import eu.openaire.publications_retriever.crawler.PageCrawler; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.http.DomainConnectionData; @@ -12,17 +13,20 @@ import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.UrlTypeChecker; import eu.openaire.publications_retriever.util.url.UrlUtils; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.IOException; import java.nio.file.Files; 
import java.nio.file.Paths; import java.text.DecimalFormat; import java.time.Duration; import java.time.Instant; -import java.util.*; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -40,24 +44,11 @@ public class PublicationsRetriever { private static final Logger logger = LoggerFactory.getLogger(PublicationsRetriever.class); - private static int initialNumOfDocFile = 0; - - public static boolean docFilesStorageGivenByUser = false; - - public static boolean inputFromUrl = false; - public static String inputDataUrl = null; - - public static InputStream inputStream = null; - public static String inputFileFullPath = null; - public static Instant startTime = null; - public static String targetUrlType = "docOrDatasetUrl"; // docUrl, documentUrl, docOrDatasetUrl ; this is set by the args-parser, and it's used when outputting data. public static final DecimalFormat df = new DecimalFormat("0.00"); public static ExecutorService executor; - public static int workerThreadsCount = 0; - public static int threadsMultiplier = 2; // Use *3 without downloading docFiles and when having the domains to appear in uniform distribution in the inputFile. Use *2 when downloading. public static void main( String[] args ) @@ -70,7 +61,7 @@ public static void main( String[] args ) startTime = Instant.now(); - parseArgs(args); + ArgsUtils.parseArgs(args); if ( ! GenericUtils.checkInternetConnectivity() ) { FileUtils.closeIO(); @@ -82,13 +73,13 @@ public static void main( String[] args ) UrlTypeChecker.setURLDirectoryFilterRegex(); // Check if the user gave the input file in the commandLineArgument, if not, then check for other options. 
- if ( PublicationsRetriever.inputStream == null ) { - if ( PublicationsRetriever.inputFromUrl ) - PublicationsRetriever.inputStream = ConnSupportUtils.getInputStreamFromInputDataUrl(); + if ( ArgsUtils.inputStream == null ) { + if ( ArgsUtils.inputFromUrl ) + ArgsUtils.inputStream = ConnSupportUtils.getInputStreamFromInputDataUrl(); else - PublicationsRetriever.inputStream = new BufferedInputStream(System.in, FileUtils.fiveMb); + ArgsUtils.inputStream = new BufferedInputStream(System.in, FileUtils.fiveMb); } else { - try ( Stream linesStream = Files.lines(Paths.get(PublicationsRetriever.inputFileFullPath)) ) { + try ( Stream linesStream = Files.lines(Paths.get(ArgsUtils.inputFileFullPath)) ) { FileUtils.numOfLines = linesStream.count(); logger.info("The numOfLines in the inputFile is " + FileUtils.numOfLines); } catch (IOException ioe) { @@ -97,21 +88,21 @@ public static void main( String[] args ) } // Use standard input/output. - new FileUtils(inputStream, System.out); + new FileUtils(ArgsUtils.inputStream, System.out); if ( MachineLearning.useMLA ) new MachineLearning(); - if ( workerThreadsCount == 0 ) { // If the user did not provide the "workerThreadsCount", then get the available number from the system. + if ( ArgsUtils.workerThreadsCount == 0 ) { // If the user did not provide the "workerThreadsCount", then get the available number from the system. int availableThreads = Runtime.getRuntime().availableProcessors(); - availableThreads *= threadsMultiplier; + availableThreads *= ArgsUtils.threadsMultiplier; // If the domains of the urls in the inputFile, are in "uniform distribution" (each one of them to be equally likely to appear in any place), then the more threads the better (triple the computer's number) // Else, if there are far lees domains and/or closely placed inside the inputFile.. then use only the number of threads provided by the computer, since the "politenessDelay" will block them more than the I/O would ever do.. 
- workerThreadsCount = availableThreads; // Due to I/O, blocking the threads all the time, more threads handle the workload faster.. + ArgsUtils.workerThreadsCount = availableThreads; // Due to I/O, blocking the threads all the time, more threads handle the workload faster.. } - logger.info("Use " + workerThreadsCount + " worker-threads."); - executor = Executors.newFixedThreadPool(workerThreadsCount); + logger.info("Use " + ArgsUtils.workerThreadsCount + " worker-threads."); + executor = Executors.newFixedThreadPool(ArgsUtils.workerThreadsCount); try { new LoaderAndChecker(); @@ -148,197 +139,6 @@ public static void main( String[] args ) } - public static void parseArgs(String[] mainArgs) - { - String usageMessage = "\nUsage: java -jar publications_retriever-.jar -retrieveDataType -inputFileFullPath inputFile -downloadDocFiles(OPTIONAL) -docFileNameType(OPTIONAL) -firstDocFileNum(OPTIONAL) 'num' -docFilesStorage(OPTIONAL) 'storageDir' -inputDataUrl(OPTIONAL) 'inputUrl' -numOfThreads(OPTIONAL) 'threadsNum' < 'input' > 'output'"; - - if ( mainArgs.length > 15 ) { - String errMessage = "\"PublicationsRetriever\" expected only up to 15 arguments, while you gave: " + mainArgs.length + "!" 
+ usageMessage; - logger.error(errMessage); - System.err.println(errMessage); - System.exit(-1); - } - - boolean firstNumGiven = false; - - for ( short i = 0; i < mainArgs.length; i++ ) - { - try { - switch ( mainArgs[i] ) { - case "-retrieveDataType": - i ++; - String dataType = mainArgs[i]; - switch (dataType) { - case "document": - logger.info("Going to retrieve only records of \"document\"-type."); - LoaderAndChecker.retrieveDocuments = true; - LoaderAndChecker.retrieveDatasets = false; - targetUrlType = "docUrl"; - break; - case "dataset": - logger.info("Going to retrieve only records of \"dataset\"-type."); - LoaderAndChecker.retrieveDocuments = false; - LoaderAndChecker.retrieveDatasets = true; - targetUrlType = "datasetUrl"; - break; - case "all": - logger.info("Going to retrieve records of all types (documents and datasets)."); - LoaderAndChecker.retrieveDocuments = true; - LoaderAndChecker.retrieveDatasets = true; - targetUrlType = "docOrDatasetUrl"; - break; - default: - String errMessage = "Argument: \"" + dataType + "\" was invalid!\nExpected one of the following: \"docFiles | datasets | all\"" + usageMessage; - System.err.println(errMessage); - logger.error(errMessage); - System.exit(9); - } - break; - case "-inputFileFullPath": - i ++; - inputFileFullPath = mainArgs[i]; - if ( !(inputFileFullPath.startsWith(File.separator) || inputFileFullPath.startsWith("~")) ) - { - if ( inputFileFullPath.startsWith("." + File.separator) ) // Remove the starting "dot" + "/" or "\", if exists. - inputFileFullPath = StringUtils.replace(inputFileFullPath, "." + File.separator, "", 1); - - inputFileFullPath = System.getProperty("user.dir") + File.separator + inputFileFullPath; // In case the given path starts with "..", then this also works. 
- } - try { - inputStream = new BufferedInputStream(new FileInputStream(inputFileFullPath), FileUtils.fiveMb); - } catch (FileNotFoundException fnfe) { - String errMessage = "No inputFile was found in \"" + inputFileFullPath + "\""; - logger.error(errMessage); - System.err.println(errMessage); - System.exit(-144); - } catch (Exception e) { - String errMessage = e.toString(); - logger.error(errMessage); - System.err.println(errMessage); - System.exit(-145); - } - break; - case "-downloadDocFiles": - FileUtils.shouldDownloadDocFiles = true; - break; - case "-docFileNameType": - i ++; - String nameType = mainArgs[i]; - switch ( nameType ) { - case "originalName": - logger.info("Going to use the \"originalName\" type."); - FileUtils.docFileNameType = FileUtils.DocFileNameType.originalName; - break; - case "idName": - if ( !LoaderAndChecker.useIdUrlPairs ) { - String errMessage = "You provided the \"DocFileNameType.idName\", but the program's reader is not set to retrieve IDs from the inputFile! Set the program to retrieve IDs by setting the \"utils.url.LoaderAndChecker.useIdUrlPairs\"-variable to \"true\"."; - System.err.println(errMessage); - logger.error(errMessage); - System.exit(10); - } else { - logger.info("Going to use the \"idName\" type."); - FileUtils.docFileNameType = FileUtils.DocFileNameType.idName; - } - break; - case "numberName": - logger.info("Going to use the \"numberName\" type."); - FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName; - break; - default: - String errMessage = "Invalid \"docFileNameType\" given (\"" + nameType + "\")\nExpected one of the following: \"originalName | idName | numberName\"" + usageMessage; - System.err.println(errMessage); - logger.error(errMessage); - System.exit(11); - } - break; - case "-firstDocFileNum": - try { - i ++; // Go get the following first-Number-argument. 
- FileUtils.numOfDocFile = PublicationsRetriever.initialNumOfDocFile = Integer.parseInt(mainArgs[i]); // We use both variables in statistics. - if ( PublicationsRetriever.initialNumOfDocFile <= 0 ) { - logger.warn("The given \"initialNumOfDocFile\" (" + PublicationsRetriever.initialNumOfDocFile + ") was a number less or equal to zero! Setting that number to <1> and continuing downloading.."); - PublicationsRetriever.initialNumOfDocFile = 1; - } - firstNumGiven = true; - break; - } catch (NumberFormatException nfe) { - String errorMessage = "Argument \"-firstDocFileNum\" must be followed by an integer value! Given one was: \"" + mainArgs[i] + "\"" + usageMessage; - System.err.println(errorMessage); - logger.error(errorMessage); - System.exit(-2); - } - case "-docFilesStorage": - i ++; - String storageDir = mainArgs[i]; - if ( storageDir.equals("S3ObjectStore") ) - FileUtils.shouldUploadFilesToS3 = true; - else - FileUtils.storeDocFilesDir = storageDir + (!storageDir.endsWith(File.separator) ? File.separator : ""); // Pre-process it.. otherwise, it may cause problems. - PublicationsRetriever.docFilesStorageGivenByUser = true; - break; - case "-inputDataUrl": - i++; - inputDataUrl = mainArgs[i]; - inputFromUrl = true; - logger.info("Using the inputFile from the URL: " + inputDataUrl); - break; - case "-numOfThreads": - i++; - String workerCountString = mainArgs[i]; - try { - workerThreadsCount = PublicationsRetriever.initialNumOfDocFile = Integer.parseInt(workerCountString); // We use both variables in statistics. 
- if ( workerThreadsCount < 1 ) { - logger.warn("The \"workerThreadsCount\" given was less than < 1 > (" + workerThreadsCount + "), continuing with < 1 > instead.."); - workerThreadsCount = 1; - } - } catch (NumberFormatException nfe) { - logger.error("Invalid \"workerThreadsCount\" was given: \"" + workerCountString + "\".\tContinue by using the system's available threads multiplied by " + threadsMultiplier); - } - break; - default: // log & ignore the argument - String errMessage = "Argument: \"" + mainArgs[i] + "\" was not expected!" + usageMessage; - System.err.println(errMessage); - logger.error(errMessage); - break; - } - } catch (ArrayIndexOutOfBoundsException aioobe) { - String errMessage = "The argument-set of \"" + mainArgs[i] + "\" was not complete!\nThe provided arguments are: " + Arrays.toString(mainArgs) + usageMessage; - System.err.println(errMessage); - logger.error(errMessage); - System.exit(90); - } - } - - if ( FileUtils.shouldDownloadDocFiles ) - { - if ( FileUtils.docFileNameType == null ) { - logger.warn("You did not specified the docNameType!" + usageMessage); - if ( LoaderAndChecker.useIdUrlPairs ) { - FileUtils.docFileNameType = FileUtils.DocFileNameType.idName; - logger.warn("The program will use the \"idName\"-type!"); - } else { - FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName; - logger.warn("The program will use the \"numberName\"-type!"); - } - } - - if ( FileUtils.shouldUploadFilesToS3 && FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.originalName) ) { - String baseMsg = "The uploading of the docFiles to the S3-ObjectStore requires the use of \"ID-names\" or \"Number-names\" for the DocFiles. 
You specified the \"originalName\" DocFileNameType."; - if ( LoaderAndChecker.useIdUrlPairs ) { - logger.warn(baseMsg + " Replacing the DocFileNameType \"originalName\" with \"idName\"."); - FileUtils.docFileNameType = FileUtils.DocFileNameType.idName; - } else { - logger.warn(baseMsg + " Replacing the DocFileNameType \"originalName\" with \"numberName\"."); - FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName; - } - } - - if ( firstNumGiven && !FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.numberName) ) - logger.warn("You provided the \"-firstDocFileNum\" a, but you also specified a \"docFileNameType\" of non numeric-type. The \"-firstDocFileNum\" will be ignored!" + usageMessage); - } - } - - public static void showStatistics(Instant startTime) { long inputCheckedUrlNum = 0; @@ -370,13 +170,13 @@ public static void showStatistics(Instant startTime) if ( SignalUtils.receivedSIGINT ) logger.warn("A SIGINT signal was received, so some of the \"checked-urls\" may have not been actually checked, that's more of a number of the \"loaded-urls\"."); - logger.info("Total " + targetUrlType + "s found: " + UrlUtils.sumOfDocUrlsFound + ". That's about: " + df.format(UrlUtils.sumOfDocUrlsFound.get() * 100.0 / inputCheckedUrlNum) + "% from the total numOfUrls checked. The rest were problematic or non-handleable url-cases."); + logger.info("Total " + ArgsUtils.targetUrlType + "s found: " + UrlUtils.sumOfDocUrlsFound + ". That's about: " + df.format(UrlUtils.sumOfDocUrlsFound.get() * 100.0 / inputCheckedUrlNum) + "% from the total numOfUrls checked. The rest were problematic or non-handleable url-cases."); if ( FileUtils.shouldDownloadDocFiles ) { int numOfStoredDocFiles = 0; if ( !FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.numberName) ) // If we have anything different from the numberName-type.. 
numOfStoredDocFiles = FileUtils.numOfDocFiles.get(); else - numOfStoredDocFiles = FileUtils.numOfDocFile - initialNumOfDocFile; + numOfStoredDocFiles = FileUtils.numOfDocFile - ArgsUtils.initialNumOfDocFile; logger.info("From which docUrls, we were able to retrieve: " + numOfStoredDocFiles + " distinct docFiles. That's about: " + df.format(numOfStoredDocFiles * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%." + " The un-retrieved docFiles were either belonging to already-found docUrls or they had connection-issues."); } @@ -431,7 +231,7 @@ public static void showStatistics(Instant startTime) logger.debug("The number of paths blocked -due to HTTP 403- in total, was: " + ConnSupportUtils.domainsMultimapWithPaths403BlackListed.values().size()); calculateAndPrintElapsedTime(startTime, Instant.now(), null); - logger.debug("Used " + workerThreadsCount + " worker threads."); + logger.debug("Used " + ArgsUtils.workerThreadsCount + " worker threads."); if ( logger.isDebugEnabled() ) { diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java index 1a5fb2b..b8e9e4e 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java @@ -1,7 +1,7 @@ package eu.openaire.publications_retriever.crawler; -import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.exceptions.*; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.http.HttpConnUtils; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; @@ -248,7 +248,7 @@ private static void handlePageWithNoDocUrls(String urlId, String sourceUrl, Stri UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet(); if ( !isAlreadyLoggedToOutput ) // 
This check is used in error-cases, where we have already logged the Quadruple. - UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as no " + PublicationsRetriever.targetUrlType + " was found inside.", null, true, "true", "true", "false", "false", "false", null, "null"); + UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as no " + ArgsUtils.targetUrlType + " was found inside.", null, true, "true", "true", "false", "false", "false", null, "null"); if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, PageCrawler.timesDomainNotGivingDocUrls, pageDomain, PageCrawler.timesToGiveNoDocUrlsBeforeBlocked, true) ) logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no docUrls more than " + PageCrawler.timesToGiveNoDocUrlsBeforeBlocked + " times."); diff --git a/src/main/java/eu/openaire/publications_retriever/util/args/ArgsUtils.java b/src/main/java/eu/openaire/publications_retriever/util/args/ArgsUtils.java new file mode 100644 index 0000000..5d3f90f --- /dev/null +++ b/src/main/java/eu/openaire/publications_retriever/util/args/ArgsUtils.java @@ -0,0 +1,264 @@ +package eu.openaire.publications_retriever.util.args; + + +import eu.openaire.publications_retriever.util.file.FileUtils; +import eu.openaire.publications_retriever.util.url.LoaderAndChecker; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.Arrays; + + +/** + * @author Lampros Smyrnaios + */ +public class ArgsUtils { + + private static final Logger logger = LoggerFactory.getLogger(ArgsUtils.class); + + public static int initialNumOfDocFile = 0; + + public static boolean docFilesStorageGivenByUser = false; + + public static boolean inputFromUrl = false; + public static String inputDataUrl = null; + + 
public static InputStream inputStream = null; + public static String inputFileFullPath = null; + + public static String targetUrlType = "docOrDatasetUrl"; // docUrl, documentUrl, docOrDatasetUrl ; this is set by the args-parser, and it's used when outputting data. + + public static int workerThreadsCount = 0; + public static int threadsMultiplier = 2; // Use *3 without downloading docFiles and when having the domains to appear in uniform distribution in the inputFile. Use *2 when downloading. + + private static final String usageMessage = "\nUsage: java -jar publications_retriever-.jar -retrieveDataType -inputFileFullPath inputFile -downloadDocFiles(OPTIONAL) -docFileNameType(OPTIONAL) -firstDocFileNum(OPTIONAL) 'num' -docFilesStorage(OPTIONAL) 'storageDir' -inputDataUrl(OPTIONAL) 'inputUrl' -numOfThreads(OPTIONAL) 'threadsNum' < 'input' > 'output'"; + + private static boolean firstNumGiven = false; + + + public static void parseArgs(String[] mainArgs) + { + if ( mainArgs.length > 15 ) { + String errMessage = "\"PublicationsRetriever\" expected only up to 15 arguments, while you gave: " + mainArgs.length + "!" 
+ usageMessage; + logger.error(errMessage); + System.err.println(errMessage); + System.exit(-1); + } + + for ( short i = 0; i < mainArgs.length; i++ ) + { + try { + switch ( mainArgs[i] ) { + case "-retrieveDataType": + i ++; + handleDatatypeArg(mainArgs[i]); + break; + case "-inputFileFullPath": + i ++; + handleFilePathArg(mainArgs[i]); + break; + case "-downloadDocFiles": + FileUtils.shouldDownloadDocFiles = true; + break; + case "-docFileNameType": + i ++; + handleFileNameType(mainArgs[i]); + break; + case "-firstDocFileNum": + i ++; + handleFirstDocFileNum(mainArgs[i]); + break; + case "-docFilesStorage": + i ++; + handleDocFilesStorage(mainArgs[i]); + break; + case "-inputDataUrl": + i++; + inputDataUrl = mainArgs[i]; + inputFromUrl = true; + logger.info("Using the inputFile from the URL: " + inputDataUrl); + break; + case "-numOfThreads": + i++; + handleNumThreads(mainArgs[i]); + break; + default: // log & ignore the argument + String errMessage = "Argument: \"" + mainArgs[i] + "\" was not expected!" 
+ usageMessage; + System.err.println(errMessage); + logger.error(errMessage); + break; + } + } catch (ArrayIndexOutOfBoundsException aioobe) { + String errMessage = "The argument-set of \"" + mainArgs[i] + "\" was not complete!\nThe provided arguments are: " + Arrays.toString(mainArgs) + usageMessage; + System.err.println(errMessage); + logger.error(errMessage); + System.exit(90); + } + } + + if ( FileUtils.shouldDownloadDocFiles ) + handleDownloadCase(); + } + + + private static void handleDatatypeArg(String dataType) + { + // Maps the "-retrieveDataType" value to the retrieve-flags and to the "targetUrlType"; any other value is reported and the program exits with code 9. + switch (dataType) { + case "document": + logger.info("Going to retrieve only records of \"document\"-type."); + LoaderAndChecker.retrieveDocuments = true; + LoaderAndChecker.retrieveDatasets = false; + targetUrlType = "docUrl"; + break; + case "dataset": + logger.info("Going to retrieve only records of \"dataset\"-type."); + LoaderAndChecker.retrieveDocuments = false; + LoaderAndChecker.retrieveDatasets = true; + targetUrlType = "datasetUrl"; + break; + case "all": + logger.info("Going to retrieve records of all types (documents and datasets)."); + LoaderAndChecker.retrieveDocuments = true; + LoaderAndChecker.retrieveDatasets = true; + targetUrlType = "docOrDatasetUrl"; + break; + default: + String errMessage = "Argument: \"" + dataType + "\" was invalid!\nExpected one of the following: \"document | dataset | all\"" + usageMessage; + System.err.println(errMessage); + logger.error(errMessage); + System.exit(9); + } + } + + + private static void handleFilePathArg(String filePathArg) + { + inputFileFullPath = filePathArg; + if ( !(inputFileFullPath.startsWith(File.separator) || inputFileFullPath.startsWith("~")) ) + { + if ( inputFileFullPath.startsWith("." + File.separator) ) // Remove the starting "dot" + "/" or "\", if exists. + inputFileFullPath = StringUtils.replace(inputFileFullPath, "."
+ File.separator, "", 1); + + inputFileFullPath = System.getProperty("user.dir") + File.separator + inputFileFullPath; // In case the given path starts with "..", then this also works. + } + try { + inputStream = new BufferedInputStream(new FileInputStream(inputFileFullPath), FileUtils.fiveMb); + } catch (FileNotFoundException fnfe) { + String errMessage = "No inputFile was found in \"" + inputFileFullPath + "\""; + logger.error(errMessage); + System.err.println(errMessage); + System.exit(-144); + } catch (Exception e) { + String errMessage = e.toString(); + logger.error(errMessage); + System.err.println(errMessage); + System.exit(-145); + } + } + + + private static void handleFileNameType(String nameType) + { + switch ( nameType ) { + case "originalName": + logger.info("Going to use the \"originalName\" type."); + FileUtils.docFileNameType = FileUtils.DocFileNameType.originalName; + break; + case "idName": + if ( !LoaderAndChecker.useIdUrlPairs ) { + String errMessage = "You provided the \"DocFileNameType.idName\", but the program's reader is not set to retrieve IDs from the inputFile! 
Set the program to retrieve IDs by setting the \"utils.url.LoaderAndChecker.useIdUrlPairs\"-variable to \"true\"."; + System.err.println(errMessage); + logger.error(errMessage); + System.exit(10); + } else { + logger.info("Going to use the \"idName\" type."); + FileUtils.docFileNameType = FileUtils.DocFileNameType.idName; + } + break; + case "numberName": + logger.info("Going to use the \"numberName\" type."); + FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName; + break; + default: + String errMessage = "Invalid \"docFileNameType\" given (\"" + nameType + "\")\nExpected one of the following: \"originalName | idName | numberName\"" + usageMessage; + System.err.println(errMessage); + logger.error(errMessage); + System.exit(11); + } + } + + + private static void handleFirstDocFileNum(String initNumStr) + { + try { + FileUtils.numOfDocFile = initialNumOfDocFile = Integer.parseInt(initNumStr); // We use both variables in statistics. + if ( initialNumOfDocFile <= 0 ) { + logger.warn("The given \"initialNumOfDocFile\" (" + initialNumOfDocFile + ") was a number less or equal to zero! Setting that number to <1> and continuing downloading.."); + FileUtils.numOfDocFile = initialNumOfDocFile = 1; // Clamp BOTH counters, otherwise "numOfDocFile - initialNumOfDocFile" in the statistics is off. + } + firstNumGiven = true; + } catch (NumberFormatException nfe) { + String errorMessage = "Argument \"-firstDocFileNum\" must be followed by an integer value! Given one was: \"" + initNumStr + "\"" + usageMessage; + System.err.println(errorMessage); + logger.error(errorMessage); + System.exit(-2); + } + } + + + private static void handleDocFilesStorage(String docStorageDir) + { + docFilesStorageGivenByUser = true; + if ( docStorageDir.equals("S3ObjectStore") ) + FileUtils.shouldUploadFilesToS3 = true; + else + FileUtils.storeDocFilesDir = docStorageDir + (!docStorageDir.endsWith(File.separator) ? File.separator : ""); // Pre-process it.. otherwise, it may cause problems.
+ } + + + private static void handleNumThreads(String workerCountString) // Sets "workerThreadsCount" from the "-numOfThreads" value; a non-numeric value is only logged and leaves the count unchanged. + { + try { + workerThreadsCount = Integer.parseInt(workerCountString); // Do NOT touch "initialNumOfDocFile" here, it belongs to "-firstDocFileNum" and is used in the docFiles-statistics. + if ( workerThreadsCount < 1 ) { + logger.warn("The \"workerThreadsCount\" given was less than < 1 > (" + workerThreadsCount + "), continuing with < 1 > instead.."); + workerThreadsCount = 1; + } + } catch (NumberFormatException nfe) { + logger.error("Invalid \"workerThreadsCount\" was given: \"" + workerCountString + "\".\tContinue by using the system's available threads multiplied by " + threadsMultiplier); + } + } + + + private static void handleDownloadCase() + { + if ( FileUtils.docFileNameType == null ) { + logger.warn("You did not specified the docNameType!" + usageMessage); + if ( LoaderAndChecker.useIdUrlPairs ) { + FileUtils.docFileNameType = FileUtils.DocFileNameType.idName; + logger.warn("The program will use the \"idName\"-type!"); + } else { + FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName; + logger.warn("The program will use the \"numberName\"-type!"); + } + } + + if ( FileUtils.shouldUploadFilesToS3 && FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.originalName) ) { + String baseMsg = "The uploading of the docFiles to the S3-ObjectStore requires the use of \"ID-names\" or \"Number-names\" for the DocFiles.
You specified the \"originalName\" DocFileNameType."; + if ( LoaderAndChecker.useIdUrlPairs ) { + logger.warn(baseMsg + " Replacing the DocFileNameType \"originalName\" with \"idName\"."); + FileUtils.docFileNameType = FileUtils.DocFileNameType.idName; + } else { + logger.warn(baseMsg + " Replacing the DocFileNameType \"originalName\" with \"numberName\"."); + FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName; + } + } + + if ( firstNumGiven && !FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.numberName) ) + logger.warn("You provided the \"-firstDocFileNum\" a, but you also specified a \"docFileNameType\" of non numeric-type. The \"-firstDocFileNum\" will be ignored!" + usageMessage); + } + +} diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java index 1eb7f57..00b88af 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java @@ -5,6 +5,7 @@ import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.crawler.MachineLearning; import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.url.DataToBeLogged; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; @@ -144,13 +145,13 @@ public static void handleStoreDocFileDirectory() if ( !dir.exists() ) { if ( !dir.mkdirs() ) { // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result is the same: "false". 
String errorMessage; - if ( PublicationsRetriever.docFilesStorageGivenByUser ) + if ( ArgsUtils.docFilesStorageGivenByUser ) errorMessage = "Problem when creating the \"storeDocFilesDir\": \"" + FileUtils.storeDocFilesDir + "\"." + "\nPlease give a valid Directory-path."; else // User has left the storageDir to be the default one. errorMessage = "Problem when creating the default \"storeDocFilesDir\": \"" + FileUtils.storeDocFilesDir + "\"." + "\nPlease verify you have the necessary privileges in the directory you are running the program from or specify the directory you want to save the files to." - + "\nIf the above is not an option, then you can set to retrieve just the " + PublicationsRetriever.targetUrlType + "s and download the full-texts later (on your own)."; + + "\nIf the above is not an option, then you can set to retrieve just the " + ArgsUtils.targetUrlType + "s and download the full-texts later (on your own)."; System.err.println(errorMessage); logger.error(errorMessage); FileUtils.closeIO(); diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java index 99caa80..9ff2cb9 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java @@ -9,6 +9,7 @@ import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException; import eu.openaire.publications_retriever.exceptions.DocLinkFoundException; import eu.openaire.publications_retriever.exceptions.DomainBlockedException; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.file.DocFileData; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.file.IdUrlTuple; @@ -1157,7 +1158,7 @@ public static boolean haveOnlyProtocolDifference(String url1, 
String url2) public static InputStream getInputStreamFromInputDataUrl() { - if ( (PublicationsRetriever.inputDataUrl == null) || PublicationsRetriever.inputDataUrl.isEmpty() ) { + if ( (ArgsUtils.inputDataUrl == null) || ArgsUtils.inputDataUrl.isEmpty() ) { String errorMessage = "The \"inputDataUrl\" was not given, even though"; logger.error(errorMessage); System.err.println(errorMessage); @@ -1167,7 +1168,7 @@ public static InputStream getInputStreamFromInputDataUrl() InputStream inputStream = null; try { - HttpURLConnection conn = HttpConnUtils.handleConnection(null, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, null, true, true); + HttpURLConnection conn = HttpConnUtils.handleConnection(null, ArgsUtils.inputDataUrl, ArgsUtils.inputDataUrl, ArgsUtils.inputDataUrl, null, true, true); String mimeType = conn.getHeaderField("Content-Type"); if ( (mimeType == null) || !mimeType.toLowerCase().contains("json") ) { String errorMessage = "The mimeType of the url was either null or a non-json: " + mimeType; diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java index 000ad0f..b6abb75 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java @@ -1,9 +1,9 @@ package eu.openaire.publications_retriever.util.http; -import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.crawler.PageCrawler; import eu.openaire.publications_retriever.crawler.SpecialUrlsHandler; import eu.openaire.publications_retriever.exceptions.*; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.file.DocFileData; import eu.openaire.publications_retriever.util.file.FileUtils; import 
eu.openaire.publications_retriever.util.url.LoaderAndChecker; @@ -186,7 +186,7 @@ else if ( LoaderAndChecker.retrieveDatasets && returnedType.equals("dataset") ) else if ( calledForPageUrl ) { // Visit this url only if this method was called for an inputUrl. if ( finalUrlStr.contains("viewcontent.cgi") ) { // If this "viewcontent.cgi" isn't a docUrl, then don't check its internalLinks. Check this: "https://docs.lib.purdue.edu/cgi/viewcontent.cgi?referer=&httpsredir=1&params=/context/physics_articles/article/1964/type/native/&path_info=" logger.warn("Unwanted pageUrl: \"" + finalUrlStr + "\" will not be visited!"); - UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after matching to a non-" + PublicationsRetriever.targetUrlType + " with 'viewcontent.cgi'.", null, true, "true", "true", "false", "false", "false", null, "null"); + UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after matching to a non-" + ArgsUtils.targetUrlType + " with 'viewcontent.cgi'.", null, true, "true", "true", "false", "false", "false", null, "null"); UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet(); return false; } @@ -195,7 +195,7 @@ else if ( (lowerCaseMimeType != null) && ((lowerCaseMimeType.contains("htm") || PageCrawler.visit(urlId, sourceUrl, finalUrlStr, mimeType, conn, firstHtmlLine, bufferedReader); else { logger.warn("Non-pageUrl: \"" + finalUrlStr + "\" with mimeType: \"" + mimeType + "\" will not be visited!"); - UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after not matching to a " + PublicationsRetriever.targetUrlType + " nor to an htm/text-like page.", null, true, "true", "true", "false", "false", "false", null, "null"); +
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after not matching to a " + ArgsUtils.targetUrlType + " nor to an htm/text-like page.", null, true, "true", "true", "false", "false", "false", null, "null"); if ( ConnSupportUtils.countAndBlockDomainAfterTimes(blacklistedDomains, timesDomainsHadInputNotBeingDocNorPage, domainStr, HttpConnUtils.timesToHaveNoDocNorPageInputBeforeBlocked, true) ) logger.warn("Domain: \"" + domainStr + "\" was blocked after having no Doc nor Pages in the input more than " + HttpConnUtils.timesToHaveNoDocNorPageInputBeforeBlocked + " times."); } // We log the quadruple here, as there is connection-kind-of problem here.. it's just us considering it an unwanted case. We don't throw "DomainBlockedException()", as we don't handle it for inputUrls (it would also log the quadruple twice with diff comments). diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/DataToBeLogged.java b/src/main/java/eu/openaire/publications_retriever/util/url/DataToBeLogged.java index 00d51f1..392a36f 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/DataToBeLogged.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/DataToBeLogged.java @@ -1,6 +1,6 @@ package eu.openaire.publications_retriever.util.url; -import eu.openaire.publications_retriever.PublicationsRetriever; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import org.apache.commons.lang3.StringUtils; import org.json.JSONException; import org.json.JSONObject; @@ -92,7 +92,7 @@ public String toJsonString() jsonObject.put("id", this.urlId); } jsonObject.put("sourceUrl", this.sourceUrl); - jsonObject.put(PublicationsRetriever.targetUrlType, this.docOrDatasetUrl); + jsonObject.put(ArgsUtils.targetUrlType, this.docOrDatasetUrl); jsonObject.put("wasUrlChecked", this.wasUrlChecked); jsonObject.put("wasUrlValid", this.wasUrlValid); 
jsonObject.put("wasDocumentOrDatasetAccessible", this.wasDocumentOrDatasetAccessible); diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java index 1581232..5952c3f 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java @@ -6,6 +6,7 @@ import eu.openaire.publications_retriever.exceptions.ConnTimeoutException; import eu.openaire.publications_retriever.exceptions.DomainBlockedException; import eu.openaire.publications_retriever.exceptions.DomainWithUnsupportedHEADmethodException; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.http.HttpConnUtils; @@ -108,7 +109,7 @@ public static void loadAndCheckUrls() throws RuntimeException else isFirstRun = false; - logger.info("Batch counter: " + (++batchCount) + ((PublicationsRetriever.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); + logger.info("Batch counter: " + (++batchCount) + ((ArgsUtils.inputFileFullPath != null) ? 
(" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); for ( String retrievedUrl : loadedUrlGroup ) { @@ -179,7 +180,7 @@ public static void loadAndCheckIdUrlPairs() throws RuntimeException else isFirstRun = false; - logger.info("Batch counter: " + (++batchCount) + ((PublicationsRetriever.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); + logger.info("Batch counter: " + (++batchCount) + ((ArgsUtils.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); Set keys = loadedIdUrlPairs.keySet(); numOfIDs += keys.size(); @@ -332,7 +333,7 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException else isFirstRun = false; - logger.info("Batch counter: " + (++batchCount) + ((PublicationsRetriever.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); + logger.info("Batch counter: " + (++batchCount) + ((ArgsUtils.inputFileFullPath != null) ? 
(" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); Set> pairs = loadedIdUrlPairs.entries(); numOfIDs += pairs.size(); @@ -412,7 +413,7 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException else isFirstRun = false; - logger.info("Batch counter: " + (++batchCount) + ((PublicationsRetriever.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); + logger.info("Batch counter: " + (++batchCount) + ((ArgsUtils.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs."); for ( String retrievedId : loadedIdUrlPairs.keySet() ) { diff --git a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java index 834dd3c..85ee2ad 100644 --- a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java +++ b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java @@ -2,6 +2,7 @@ import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.crawler.MachineLearning; +import eu.openaire.publications_retriever.util.args.ArgsUtils; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.signal.SignalUtils; @@ -50,8 +51,8 @@ public static void setTypeOfInputData() if ( 
!LoaderAndChecker.useIdUrlPairs ) FileUtils.skipFirstRow = false; // Use "true", if we have a "column-name" in our csv file. Default: "false". - if ( PublicationsRetriever.inputFromUrl ) - logger.info("Using the inputFile from URL: \"" + PublicationsRetriever.inputDataUrl + "\" and the outputFile: \"" + outputFile.getName() + "\"."); + if ( ArgsUtils.inputFromUrl ) + logger.info("Using the inputFile from URL: \"" + ArgsUtils.inputDataUrl + "\" and the outputFile: \"" + outputFile.getName() + "\"."); else logger.info("Using the inputFile: \"" + inputFile.getName() + "\" and the outputFile: \"" + outputFile.getName() + "\"."); } @@ -175,7 +176,7 @@ public static void main( String[] args ) PublicationsRetriever.startTime = Instant.now(); - PublicationsRetriever.parseArgs(args); + ArgsUtils.parseArgs(args); if ( ! GenericUtils.checkInternetConnectivity() ) { FileUtils.closeIO(); @@ -197,16 +198,16 @@ public static void main( String[] args ) if ( MachineLearning.useMLA ) new MachineLearning(); - if ( PublicationsRetriever.workerThreadsCount == 0 ) { // If the user did not provide the "workerThreadsCount", then get the available number from the system. + if ( ArgsUtils.workerThreadsCount == 0 ) { // If the user did not provide the "workerThreadsCount", then get the available number from the system. int availableThreads = Runtime.getRuntime().availableProcessors(); - availableThreads *= PublicationsRetriever.threadsMultiplier; + availableThreads *= ArgsUtils.threadsMultiplier; // If the domains of the urls in the inputFile, are in "uniform distribution" (each one of them to be equally likely to appear in any place), then the more threads the better (triple the computer's number) // Else, if there are far lees domains or/and closely placed inside the inputFile.. then use only the number of threads provided by the computer, since the "politenessDelay" will block them more than the I/O would ever do.. 
- PublicationsRetriever.workerThreadsCount = availableThreads; // Due to I/O, blocking the threads all the time, more threads handle the workload faster.. + ArgsUtils.workerThreadsCount = availableThreads; // Due to I/O, blocking the threads all the time, more threads handle the workload faster.. } - logger.info("Use " + PublicationsRetriever.workerThreadsCount + " worker-threads."); - PublicationsRetriever.executor = Executors.newFixedThreadPool(PublicationsRetriever.workerThreadsCount); + logger.info("Use " + ArgsUtils.workerThreadsCount + " worker-threads."); + PublicationsRetriever.executor = Executors.newFixedThreadPool(ArgsUtils.workerThreadsCount); try { new LoaderAndChecker(); @@ -247,13 +248,13 @@ public static void setInputOutput() { try { // Check if the user gave the input file in the commandLineArgument, if not, then check for other options. - if ( PublicationsRetriever.inputStream == null ) { - if ( PublicationsRetriever.inputFromUrl ) - PublicationsRetriever.inputStream = ConnSupportUtils.getInputStreamFromInputDataUrl(); + if ( ArgsUtils.inputStream == null ) { + if ( ArgsUtils.inputFromUrl ) + ArgsUtils.inputStream = ConnSupportUtils.getInputStreamFromInputDataUrl(); else - PublicationsRetriever.inputStream = new BufferedInputStream(new FileInputStream(inputFile), FileUtils.fiveMb); + ArgsUtils.inputStream = new BufferedInputStream(new FileInputStream(inputFile), FileUtils.fiveMb); } else { - try ( Stream linesStream = Files.lines(Paths.get(PublicationsRetriever.inputFileFullPath), StandardCharsets.UTF_8) ) { + try ( Stream linesStream = Files.lines(Paths.get(ArgsUtils.inputFileFullPath), StandardCharsets.UTF_8) ) { FileUtils.numOfLines = linesStream.count(); logger.info("The numOfLines in the inputFile is " + FileUtils.numOfLines); } catch (IOException ioe) { @@ -261,16 +262,16 @@ public static void setInputOutput() } } - if ( PublicationsRetriever.inputFileFullPath != null ) { // If the user gave the inputFile as a cmd-arg.. 
+ if ( ArgsUtils.inputFileFullPath != null ) { // If the user gave the inputFile as a cmd-arg.. // Extract the path and the file-name. Do a split in reverse order. String path = null; String inputFileName = null; char separatorChar = File.separator.charAt(0); // The "inputFileFullPath" is guaranteed to have at least one "separator". - for ( int i = PublicationsRetriever.inputFileFullPath.length() -1; i >= 0 ; --i ) { - if ( PublicationsRetriever.inputFileFullPath.charAt(i) == separatorChar ) { + for ( int i = ArgsUtils.inputFileFullPath.length() -1; i >= 0 ; --i ) { + if ( ArgsUtils.inputFileFullPath.charAt(i) == separatorChar ) { i++; // The following methods need the increased < i > - path = PublicationsRetriever.inputFileFullPath.substring(0, i); - inputFileName = PublicationsRetriever.inputFileFullPath.substring(i); + path = ArgsUtils.inputFileFullPath.substring(0, i); + inputFileName = ArgsUtils.inputFileFullPath.substring(i); break; } } @@ -278,7 +279,7 @@ public static void setInputOutput() outputFile = new File(path + "results_" + inputFileName); } - new FileUtils(PublicationsRetriever.inputStream, new BufferedOutputStream(new FileOutputStream(outputFile), FileUtils.fiveMb)); + new FileUtils(ArgsUtils.inputStream, new BufferedOutputStream(new FileOutputStream(outputFile), FileUtils.fiveMb)); setTypeOfInputData();