Skip to content

Commit

Permalink
Refactor cmd-arguments-handling.
Browse files Browse the repository at this point in the history
  • Loading branch information
LSmyrnaios committed Nov 6, 2024
1 parent 50a59d5 commit 5a4938a
Show file tree
Hide file tree
Showing 9 changed files with 324 additions and 256 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.exceptions.*;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
Expand Down Expand Up @@ -248,7 +248,7 @@ private static void handlePageWithNoDocUrls(String urlId, String sourceUrl, Stri

UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet();
if ( !isAlreadyLoggedToOutput ) // This check is used in error-cases, where we have already logged the Quadruple.
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as no " + PublicationsRetriever.targetUrlType + " was found inside.", null, true, "true", "true", "false", "false", "false", null, "null");
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as no " + ArgsUtils.targetUrlType + " was found inside.", null, true, "true", "true", "false", "false", "false", null, "null");

if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, PageCrawler.timesDomainNotGivingDocUrls, pageDomain, PageCrawler.timesToGiveNoDocUrlsBeforeBlocked, true) )
logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no docUrls more than " + PageCrawler.timesToGiveNoDocUrlsBeforeBlocked + " times.");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
package eu.openaire.publications_retriever.util.args;


import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.Arrays;


/**
 * Parses the command-line arguments of the program and stores the resulting configuration
 * in static fields (of this class, of {@link FileUtils} and of {@link LoaderAndChecker}).
 * <p>
 * Invalid or incomplete arguments are reported both to the logger and to stderr;
 * most of them terminate the JVM through {@link System#exit(int)}.
 *
 * @author Lampros Smyrnaios
 */
public class ArgsUtils {

	private static final Logger logger = LoggerFactory.getLogger(ArgsUtils.class);

	/** The number given with "-firstDocFileNum" (at least 1 when given); 0 when the option was not given. */
	public static int initialNumOfDocFile = 0;

	/** Set to true when the "-docFilesStorage" option was given. */
	public static boolean docFilesStorageGivenByUser = false;

	/** Set to true when the "-inputDataUrl" option was given. */
	public static boolean inputFromUrl = false;
	public static String inputDataUrl = null;

	/** The stream opened on the file given with "-inputFileFullPath"; null when that option was not given. */
	public static InputStream inputStream = null;
	public static String inputFileFullPath = null;

	public static String targetUrlType = "docOrDatasetUrl";	// docUrl, documentUrl, docOrDatasetUrl ; this is set by the args-parser, and it's used when outputting data.

	public static int workerThreadsCount = 0;
	public static int threadsMultiplier = 2;	// Use *3 without downloading docFiles and when having the domains to appear in uniform distribution in the inputFile. Use *2 when downloading.

	private static final String usageMessage = "\nUsage: java -jar publications_retriever-<VERSION>.jar -retrieveDataType <dataType: document | dataset | all> -inputFileFullPath inputFile -downloadDocFiles(OPTIONAL) -docFileNameType(OPTIONAL) <nameType: originalName | idName | numberName> -firstDocFileNum(OPTIONAL) 'num' -docFilesStorage(OPTIONAL) 'storageDir' -inputDataUrl(OPTIONAL) 'inputUrl' -numOfThreads(OPTIONAL) 'threadsNum' < 'input' > 'output'";

	/** Whether a valid "-firstDocFileNum" was given; checked later by {@link #handleDownloadCase()}. */
	private static boolean firstNumGiven = false;


	/**
	 * Parses the given command-line arguments and applies them to the static configuration.
	 * Unknown arguments are logged and ignored.
	 * The JVM exits with code -1 if more than 15 arguments are given and with code 90 if an option is missing its value.
	 *
	 * @param mainArgs the arguments, as received by the "main" method
	 */
	public static void parseArgs(String[] mainArgs)
	{
		if ( mainArgs.length > 15 ) {
			String errMessage = "\"PublicationsRetriever\" expected only up to 15 arguments, while you gave: " + mainArgs.length + "!" + usageMessage;
			logger.error(errMessage);
			System.err.println(errMessage);
			System.exit(-1);
		}

		for ( short i = 0; i < mainArgs.length; i++ )
		{
			// Keep the option's name, because "i" may point past the end of the array when the option's value is missing.
			// Using "mainArgs[i]" inside the catch-block would then throw a second (uncaught) exception, instead of reporting the problem.
			String option = mainArgs[i];
			try {
				switch ( option ) {
					case "-retrieveDataType":
						handleDatatypeArg(mainArgs[++i]);
						break;
					case "-inputFileFullPath":
						handleFilePathArg(mainArgs[++i]);
						break;
					case "-downloadDocFiles":
						FileUtils.shouldDownloadDocFiles = true;
						break;
					case "-docFileNameType":
						handleFileNameType(mainArgs[++i]);
						break;
					case "-firstDocFileNum":
						handleFirstDocFileNum(mainArgs[++i]);
						break;
					case "-docFilesStorage":
						handleDocFilesStorage(mainArgs[++i]);
						break;
					case "-inputDataUrl":
						inputDataUrl = mainArgs[++i];
						inputFromUrl = true;
						logger.info("Using the inputFile from the URL: " + inputDataUrl);
						break;
					case "-numOfThreads":
						handleNumThreads(mainArgs[++i]);
						break;
					default:	// log & ignore the argument
						String errMessage = "Argument: \"" + option + "\" was not expected!" + usageMessage;
						System.err.println(errMessage);
						logger.error(errMessage);
						break;
				}
			} catch (ArrayIndexOutOfBoundsException aioobe) {	// The option was the last argument, so its value is missing.
				String errMessage = "The argument-set of \"" + option + "\" was not complete!\nThe provided arguments are: " + Arrays.toString(mainArgs) + usageMessage;
				System.err.println(errMessage);
				logger.error(errMessage);
				System.exit(90);
			}
		}

		if ( FileUtils.shouldDownloadDocFiles )
			handleDownloadCase();
	}


	/**
	 * Applies the value of "-retrieveDataType": sets which record-types will be retrieved and the matching "targetUrlType".
	 * The JVM exits with code 9 on an unknown value.
	 *
	 * @param dataType one of: "document", "dataset", "all"
	 */
	private static void handleDatatypeArg(String dataType)
	{
		switch (dataType) {
			case "document":
				logger.info("Going to retrieve only records of \"document\"-type.");
				LoaderAndChecker.retrieveDocuments = true;
				LoaderAndChecker.retrieveDatasets = false;
				targetUrlType = "docUrl";
				break;
			case "dataset":
				logger.info("Going to retrieve only records of \"dataset\"-type.");
				LoaderAndChecker.retrieveDocuments = false;
				LoaderAndChecker.retrieveDatasets = true;
				targetUrlType = "datasetUrl";
				break;
			case "all":
				logger.info("Going to retrieve records of all types (documents and datasets).");
				LoaderAndChecker.retrieveDocuments = true;
				LoaderAndChecker.retrieveDatasets = true;
				targetUrlType = "docOrDatasetUrl";
				break;
			default:
				// The listed values have to be the ones accepted by the cases above.
				String errMessage = "Argument: \"" + dataType + "\" was invalid!\nExpected one of the following: \"document | dataset | all\"" + usageMessage;
				System.err.println(errMessage);
				logger.error(errMessage);
				System.exit(9);
		}
	}


	/**
	 * Applies the value of "-inputFileFullPath": resolves a relative path against the "user.dir" property
	 * and opens a buffered stream on the file.
	 * The JVM exits with code -144 if the file is not found and with code -145 on any other failure.
	 *
	 * @param filePathArg the path of the input file, as given by the user
	 */
	private static void handleFilePathArg(String filePathArg)
	{
		inputFileFullPath = filePathArg;

		// "File.isAbsolute()" additionally recognizes platform-specific absolute forms (e.g. a drive-letter prefix),
		// which would otherwise get the working-directory prepended and result in an invalid path.
		boolean needsNoPrefix = inputFileFullPath.startsWith(File.separator)
				|| inputFileFullPath.startsWith("~")
				|| new File(inputFileFullPath).isAbsolute();
		if ( !needsNoPrefix )
		{
			if ( inputFileFullPath.startsWith("." + File.separator) )	// Remove the starting "dot" + "/" or "\", if exists.
				inputFileFullPath = StringUtils.replace(inputFileFullPath, "." + File.separator, "", 1);

			inputFileFullPath = System.getProperty("user.dir") + File.separator + inputFileFullPath;	// In case the given path starts with "..", then this also works.
		}

		try {
			inputStream = new BufferedInputStream(new FileInputStream(inputFileFullPath), FileUtils.fiveMb);
		} catch (FileNotFoundException fnfe) {
			String errMessage = "No inputFile was found in \"" + inputFileFullPath + "\"";
			logger.error(errMessage);
			System.err.println(errMessage);
			System.exit(-144);
		} catch (Exception e) {
			String errMessage = e.toString();
			logger.error(errMessage);
			System.err.println(errMessage);
			System.exit(-145);
		}
	}


	/**
	 * Applies the value of "-docFileNameType".
	 * The JVM exits with code 10 if "idName" is requested while "LoaderAndChecker.useIdUrlPairs" is false
	 * and with code 11 on an unknown value.
	 *
	 * @param nameType one of: "originalName", "idName", "numberName"
	 */
	private static void handleFileNameType(String nameType)
	{
		switch ( nameType ) {
			case "originalName":
				logger.info("Going to use the \"originalName\" type.");
				FileUtils.docFileNameType = FileUtils.DocFileNameType.originalName;
				break;
			case "idName":
				if ( !LoaderAndChecker.useIdUrlPairs ) {
					String errMessage = "You provided the \"DocFileNameType.idName\", but the program's reader is not set to retrieve IDs from the inputFile! Set the program to retrieve IDs by setting the \"utils.url.LoaderAndChecker.useIdUrlPairs\"-variable to \"true\".";
					System.err.println(errMessage);
					logger.error(errMessage);
					System.exit(10);
				} else {
					logger.info("Going to use the \"idName\" type.");
					FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
				}
				break;
			case "numberName":
				logger.info("Going to use the \"numberName\" type.");
				FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName;
				break;
			default:
				String errMessage = "Invalid \"docFileNameType\" given (\"" + nameType + "\")\nExpected one of the following: \"originalName | idName | numberName\"" + usageMessage;
				System.err.println(errMessage);
				logger.error(errMessage);
				System.exit(11);
		}
	}


	/**
	 * Applies the value of "-firstDocFileNum". A value less or equal to zero is replaced by 1.
	 * The JVM exits with code -2 if the value is not an integer.
	 *
	 * @param initNumStr the number to be used for the first docFile, as given by the user
	 */
	private static void handleFirstDocFileNum(String initNumStr)
	{
		try {
			int firstNum = Integer.parseInt(initNumStr);
			if ( firstNum <= 0 ) {
				logger.warn("The given \"initialNumOfDocFile\" (" + firstNum + ") was a number less or equal to zero! Setting that number to <1> and continuing downloading..");
				firstNum = 1;
			}
			// Assign AFTER the correction, so that both counters hold the same (valid) starting number.
			FileUtils.numOfDocFile = initialNumOfDocFile = firstNum;	// We use both variables in statistics.
			firstNumGiven = true;
		} catch (NumberFormatException nfe) {
			String errorMessage = "Argument \"-firstDocFileNum\" must be followed by an integer value! Given one was: \"" + initNumStr + "\"" + usageMessage;
			System.err.println(errorMessage);
			logger.error(errorMessage);
			System.exit(-2);
		}
	}


	/**
	 * Applies the value of "-docFilesStorage": either enables the upload to the S3-ObjectStore (for the special value "S3ObjectStore")
	 * or sets the local storage directory, making sure it ends with the file-separator.
	 *
	 * @param docStorageDir the storage directory or the literal "S3ObjectStore"
	 */
	private static void handleDocFilesStorage(String docStorageDir)
	{
		docFilesStorageGivenByUser = true;
		if ( docStorageDir.equals("S3ObjectStore") )
			FileUtils.shouldUploadFilesToS3 = true;
		else
			FileUtils.storeDocFilesDir = docStorageDir + (!docStorageDir.endsWith(File.separator) ? File.separator : "");	// Pre-process it.. otherwise, it may cause problems.
	}


	/**
	 * Applies the value of "-numOfThreads". A value less than 1 is replaced by 1.
	 * A non-integer value is only logged and leaves "workerThreadsCount" unchanged.
	 *
	 * @param workerCountString the number of worker-threads, as given by the user
	 */
	private static void handleNumThreads(String workerCountString)
	{
		try {
			// Only the threads-count is set here; "initialNumOfDocFile" belongs to the "-firstDocFileNum" option and must not be touched.
			int givenCount = Integer.parseInt(workerCountString);
			if ( givenCount < 1 ) {
				logger.warn("The \"workerThreadsCount\" given was less than < 1 > (" + givenCount + "), continuing with < 1 > instead..");
				givenCount = 1;
			}
			workerThreadsCount = givenCount;
		} catch (NumberFormatException nfe) {
			logger.error("Invalid \"workerThreadsCount\" was given: \"" + workerCountString + "\".\tContinue by using the system's available threads multiplied by " + threadsMultiplier);
		}
	}


	/**
	 * Finalizes the download-related configuration, after all arguments have been parsed:
	 * chooses a default docFileNameType when none was given, replaces "originalName" when uploading to S3
	 * and warns when "-firstDocFileNum" was given along with a non-numeric name-type.
	 */
	private static void handleDownloadCase()
	{
		if ( FileUtils.docFileNameType == null ) {
			logger.warn("You did not specified the docNameType!" + usageMessage);
			if ( LoaderAndChecker.useIdUrlPairs ) {
				FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
				logger.warn("The program will use the \"idName\"-type!");
			} else {
				FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName;
				logger.warn("The program will use the \"numberName\"-type!");
			}
		}

		if ( FileUtils.shouldUploadFilesToS3 && FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.originalName) ) {
			String baseMsg = "The uploading of the docFiles to the S3-ObjectStore requires the use of \"ID-names\" or \"Number-names\" for the DocFiles. You specified the \"originalName\" DocFileNameType.";
			if ( LoaderAndChecker.useIdUrlPairs ) {
				logger.warn(baseMsg + " Replacing the DocFileNameType \"originalName\" with \"idName\".");
				FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
			} else {
				logger.warn(baseMsg + " Replacing the DocFileNameType \"originalName\" with \"numberName\".");
				FileUtils.docFileNameType = FileUtils.DocFileNameType.numberName;
			}
		}

		if ( firstNumGiven && !FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.numberName) )
			logger.warn("You provided the \"-firstDocFileNum\" a, but you also specified a \"docFileNameType\" of non numeric-type. The \"-firstDocFileNum\" will be ignored!" + usageMessage);
	}

}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.MachineLearning;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
Expand Down Expand Up @@ -144,13 +145,13 @@ public static void handleStoreDocFileDirectory()
if ( !dir.exists() ) {
if ( !dir.mkdirs() ) { // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result is the same: "false".
String errorMessage;
if ( PublicationsRetriever.docFilesStorageGivenByUser )
if ( ArgsUtils.docFilesStorageGivenByUser )
errorMessage = "Problem when creating the \"storeDocFilesDir\": \"" + FileUtils.storeDocFilesDir + "\"."
+ "\nPlease give a valid Directory-path.";
else // User has left the storageDir to be the default one.
errorMessage = "Problem when creating the default \"storeDocFilesDir\": \"" + FileUtils.storeDocFilesDir + "\"."
+ "\nPlease verify you have the necessary privileges in the directory you are running the program from or specify the directory you want to save the files to."
+ "\nIf the above is not an option, then you can set to retrieve just the " + PublicationsRetriever.targetUrlType + "s and download the full-texts later (on your own).";
+ "\nIf the above is not an option, then you can set to retrieve just the " + ArgsUtils.targetUrlType + "s and download the full-texts later (on your own).";
System.err.println(errorMessage);
logger.error(errorMessage);
FileUtils.closeIO();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.DocFileData;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.file.IdUrlTuple;
Expand Down Expand Up @@ -1157,7 +1158,7 @@ public static boolean haveOnlyProtocolDifference(String url1, String url2)

public static InputStream getInputStreamFromInputDataUrl()
{
if ( (PublicationsRetriever.inputDataUrl == null) || PublicationsRetriever.inputDataUrl.isEmpty() ) {
if ( (ArgsUtils.inputDataUrl == null) || ArgsUtils.inputDataUrl.isEmpty() ) {
String errorMessage = "The \"inputDataUrl\" was not given, even though";
logger.error(errorMessage);
System.err.println(errorMessage);
Expand All @@ -1167,7 +1168,7 @@ public static InputStream getInputStreamFromInputDataUrl()

InputStream inputStream = null;
try {
HttpURLConnection conn = HttpConnUtils.handleConnection(null, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, null, true, true);
HttpURLConnection conn = HttpConnUtils.handleConnection(null, ArgsUtils.inputDataUrl, ArgsUtils.inputDataUrl, ArgsUtils.inputDataUrl, null, true, true);
String mimeType = conn.getHeaderField("Content-Type");
if ( (mimeType == null) || !mimeType.toLowerCase().contains("json") ) {
String errorMessage = "The mimeType of the url was either null or a non-json: " + mimeType;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package eu.openaire.publications_retriever.util.http;

import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.crawler.SpecialUrlsHandler;
import eu.openaire.publications_retriever.exceptions.*;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.DocFileData;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
Expand Down Expand Up @@ -186,7 +186,7 @@ else if ( LoaderAndChecker.retrieveDatasets && returnedType.equals("dataset") )
else if ( calledForPageUrl ) { // Visit this url only if this method was called for an inputUrl.
if ( finalUrlStr.contains("viewcontent.cgi") ) { // If this "viewcontent.cgi" isn't a docUrl, then don't check its internalLinks. Check this: "https://docs.lib.purdue.edu/cgi/viewcontent.cgi?referer=&httpsredir=1&params=/context/physics_articles/article/1964/type/native/&path_info="
logger.warn("Unwanted pageUrl: \"" + finalUrlStr + "\" will not be visited!");
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after matching to a non-" + PublicationsRetriever.targetUrlType + " with 'viewcontent.cgi'.", null, true, "true", "true", "false", "false", "false", null, "null");
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after matching to a non-" + ArgsUtils.targetUrlType + " with 'viewcontent.cgi'.", null, true, "true", "true", "false", "false", "false", null, "null");
UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet();
return false;
}
Expand All @@ -195,7 +195,7 @@ else if ( (lowerCaseMimeType != null) && ((lowerCaseMimeType.contains("htm") ||
PageCrawler.visit(urlId, sourceUrl, finalUrlStr, mimeType, conn, firstHtmlLine, bufferedReader);
else {
logger.warn("Non-pageUrl: \"" + finalUrlStr + "\" with mimeType: \"" + mimeType + "\" will not be visited!");
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after not matching to a " + PublicationsRetriever.targetUrlType + " nor to an htm/text-like page.", null, true, "true", "true", "false", "false", "false", null, "null");
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded in 'HttpConnUtils.connectAndCheckMimeType()', after not matching to a " + ArgsUtils.targetUrlType + " nor to an htm/text-like page.", null, true, "true", "true", "false", "false", "false", null, "null");
if ( ConnSupportUtils.countAndBlockDomainAfterTimes(blacklistedDomains, timesDomainsHadInputNotBeingDocNorPage, domainStr, HttpConnUtils.timesToHaveNoDocNorPageInputBeforeBlocked, true) )
logger.warn("Domain: \"" + domainStr + "\" was blocked after having no Doc nor Pages in the input more than " + HttpConnUtils.timesToHaveNoDocNorPageInputBeforeBlocked + " times.");
} // We log the quadruple here, as there is connection-kind-of problem here.. it's just us considering it an unwanted case. We don't throw "DomainBlockedException()", as we don't handle it for inputUrls (it would also log the quadruple twice with diff comments).
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package eu.openaire.publications_retriever.util.url;

import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import org.apache.commons.lang3.StringUtils;
import org.json.JSONException;
import org.json.JSONObject;
Expand Down Expand Up @@ -92,7 +92,7 @@ public String toJsonString()
jsonObject.put("id", this.urlId);
}
jsonObject.put("sourceUrl", this.sourceUrl);
jsonObject.put(PublicationsRetriever.targetUrlType, this.docOrDatasetUrl);
jsonObject.put(ArgsUtils.targetUrlType, this.docOrDatasetUrl);
jsonObject.put("wasUrlChecked", this.wasUrlChecked);
jsonObject.put("wasUrlValid", this.wasUrlValid);
jsonObject.put("wasDocumentOrDatasetAccessible", this.wasDocumentOrDatasetAccessible);
Expand Down
Loading

0 comments on commit 5a4938a

Please sign in to comment.