Merge pull request #17 from LSmyrnaios/dev
Optimizations and bug fixes
LSmyrnaios authored Dec 5, 2024
2 parents db57f56 + 37973a4 commit 9228341
Showing 24 changed files with 380 additions and 300 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/codeql-analysis.yml
@@ -13,12 +13,10 @@ name: "CodeQL"

on:
push:
branches: [ master ]
branches: [ master, dev ]
pull_request:
# The branches below must be a subset of the branches above
branches: [ master ]
schedule:
- cron: '18 22 * * 1'
branches: [ master, dev ]

jobs:
analyze:
4 changes: 2 additions & 2 deletions .github/workflows/maven.yml
@@ -5,9 +5,9 @@ name: Java CI with Maven

on:
push:
branches: [ master ]
branches: [ master, dev ]
pull_request:
branches: [ master ]
branches: [ master, dev ]

jobs:
build:
14 changes: 7 additions & 7 deletions README.md
@@ -81,7 +81,7 @@ To run the application you should navigate to the ***target*** directory, which
while choosing the appropriate run-command.<br>

**Run with standard input/output:**<br>
**``java -jar publications_retriever-1.2-SNAPSHOT.jar arg1:'-inputFileFullPath' arg2:<inputFile> arg3:'-retrieveDataType' arg4:'<dataType: document | dataset | all>' arg5:'-downloadDocFiles' arg6:'-docFileNameType' arg7:'idName' arg8:'-firstDocFileNum' arg9:'NUM' arg10:'-docFilesStorage'
**``java -jar publications_retriever-1.2-SNAPSHOT.jar arg1:'-inputFileFullPath' arg2:<inputFile> arg3:'-retrieveDataType' arg4:'<dataType: document | dataset | all>' arg5:'-downloadDocFiles' arg6:'-fileNameType' arg7:'idName' arg8:'-firstFileNum' arg9:'NUM' arg10:'-docFilesStorage'
arg11:'storageDir' < stdIn:'inputJsonFile' > stdOut:'outputJsonFile'``**<br>

**Run tests with custom input/output:**
@@ -93,7 +93,7 @@ arg11:'storageDir' < stdIn:'inputJsonFile' > stdOut:'outputJsonFile'``**<br>
and change the ***appender-ref***, from ***File*** to ***Console***.<br>
- Run ``mvn clean install`` to create the new ***JAR*** file.<br>
- Execute the program with the following command:<br>
**``java -jar publications_retriever-1.2-SNAPSHOT.jar arg2:'<dataType: document | dataset | all>' arg3:'-downloadDocFiles' arg4:'-docFileNameType' arg5:'numberName' arg6:'-firstDocFileNum' arg7:'NUM' arg8:'-docFilesStorage' arg9:'storageDir' arg10:'-inputDataUrl' arg11: 'inputUrl' arg12: '-numOfThreads' arg13: <NUM>``**
**``java -jar publications_retriever-1.2-SNAPSHOT.jar arg2:'<dataType: document | dataset | all>' arg3:'-downloadDocFiles' arg4:'-fileNameType' arg5:'numberName' arg6:'-firstFileNum' arg7:'NUM' arg8:'-docFilesStorage' arg9:'storageDir' arg10:'-inputDataUrl' arg11: 'inputUrl' arg12: '-numOfThreads' arg13: <NUM>``**
<br><br>
*You can use the argument '-inputFileFullPath' to define the inputFile, instead of the stdin-redirection. That way, the progress percentage will appear in the logging file.*
<br><br>
@@ -103,8 +103,8 @@ arg11:'storageDir' < stdIn:'inputJsonFile' > stdOut:'outputJsonFile'``**<br>
- **-downloadDocFiles** will tell the program to download the DocFiles.
The absence of this argument will cause the program to NOT download the docFiles, but just to find the *DocUrls* instead.
Either way the DocUrls will be written to the JsonOutputFile.
- **-docFileNameType** and **< fileNameType >** will tell the program which fileName-type to use (*originalName, idName, numberName*).
- **-firstDocFileNum** and **< NUM >** will tell the program to use numbers as *DocFileNames* and the first *DocFile* will have the given number "*NUM*".
- **-fileNameType** and **< fileNameType >** will tell the program which fileName-type to use (*originalName, idName, numberName*).
- **-firstFileNum** and **< NUM >** will tell the program to use numbers as *DocFileNames* and the first *DocFile* will have the given number "*NUM*".
The absence of this argument-group will cause the program to use the original-docFileNames.
- **-docFilesStorage** and **storageDir** will tell the program to use the given DocFiles-*storageDir*.
If the *storageDir* is equal to **"S3ObjectStore"** , then the program uploads the DocFiles to an S3 storage (see the **note** below).
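
A note on the renamed `-fileNameType` / `-firstFileNum` options above: as the `ArgsUtils.fileNameTypeEnum` references later in this diff suggest, these flags are backed by an enum of file-name types. The following is a minimal, hypothetical sketch of how such flags could be mapped onto an enum; the parser, class and field names here are illustrative and are not the project's actual `ArgsUtils` code.

```java
// Hypothetical sketch (not the real ArgsUtils): mapping the renamed flags to typed settings.
public class FileNameArgsSketch {

    // Illustrative enum, modeled on the fileName-types listed above.
    enum FileNameType { originalName, idName, numberName }

    static FileNameType fileNameType = FileNameType.originalName; // default: keep the original docFileNames
    static int firstFileNum = 1;                                  // only meaningful with "numberName"

    static void parse(String[] args) {
        for ( int i = 0; i < args.length; i++ ) {
            if ( "-fileNameType".equals(args[i]) )
                fileNameType = FileNameType.valueOf(args[++i]);   // e.g. "numberName"
            else if ( "-firstFileNum".equals(args[i]) ) {
                fileNameType = FileNameType.numberName;           // numbered file-names are implied
                firstFileNum = Integer.parseInt(args[++i]);
            }
            // Other arguments (-downloadDocFiles, -docFilesStorage, ...) are omitted in this sketch.
        }
    }

    public static void main(String[] args) {
        parse(new String[]{ "-fileNameType", "numberName", "-firstFileNum", "1" });
        System.out.println(fileNameType + ", starting from file-number " + firstFileNum);
    }
}
```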
@@ -127,15 +127,15 @@ The above script will run the following commands:
- **`mvn clean install`**: Does a *clean install*.
- **`rm -rf example/sample_output/*`**: Removes any previous example-results.
- **``cd target &&
java -jar publications_retriever-1.2-SNAPSHOT.jar -retrieveDataType document -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles
java -jar publications_retriever-1.2-SNAPSHOT.jar -retrieveDataType document -downloadDocFiles -fileNameType numberName -firstFileNum 1 -docFilesStorage ../example/sample_output/DocFiles
< ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json``**<br>
This command will run the program with "**../example/sample_input/sample_input.json**" as input
and "**../example/sample_output/sample_output.json**" as the output.<br>
The arguments used are:
- **-retrieveDataType** and **document** will tell the program to retrieve the urls of type "*document*".
- **-downloadDocFiles** which will tell the program to download the DocFiles.
- **-docFileNameType numberName** which will tell the program to use numbers as the docFileNames.
- **-firstDocFileNum 1** which will tell the program to use numbers as DocFileNames and the first DocFile will have the number <*1*>.
- **-fileNameType numberName** which will tell the program to use numbers as the docFileNames.
- **-firstFileNum 1** which will tell the program to use numbers as DocFileNames and the first DocFile will have the number <*1*>.
- **-docFilesStorage ../example/sample_output/DocFiles** which will tell the program to use the custom DocFilesStorageDir: "*../example/sample_output/DocFiles*".
<br>

6 changes: 3 additions & 3 deletions pom.xml
@@ -114,7 +114,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.1</version>
<version>1.18.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
@@ -160,7 +160,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.17.0</version>
<version>2.18.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.json/json -->
@@ -174,7 +174,7 @@
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.13</version>
<version>8.5.14</version>
</dependency>

<!-- Use this dependency in case we move to Java 11 in the future (or at least use JDK-11 in a VM).. -->
2 changes: 1 addition & 1 deletion runExample.sh
@@ -9,7 +9,7 @@ rm -rf example/sample_output/*
# Run the program.
cd target || exit

command="java -jar publications_retriever-1.3-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json"
command="java -jar publications_retriever-1.3-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -fileNameType numberName -firstFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json"
echo -e "\nRunning: $command\n"
eval "$command"
echo "Finished"
@@ -36,7 +36,7 @@

/**
* This class contains the entry-point of this program, the "main()" method.
* The "main()" method calls other methods to set the input/output streams and retrieve the docUrls for each docPage in the inputFile.
* The "main()" method calls other methods to set the input/output streams and retrieve the docOrDatasetUrls for each docPage in the inputFile.
* In the end, the outputFile consists of docPages along with their docUrls.
* @author Lampros Smyrnaios
*/
@@ -70,7 +70,7 @@ public static void main( String[] args )

logger.info("Starting PublicationsRetriever..");
ConnSupportUtils.setKnownMimeTypes();
UrlTypeChecker.setURLDirectoryFilterRegex();
UrlTypeChecker.setRuntimeInitializedRegexes();

// Check if the user gave the input file in the commandLineArgument, if not, then check for other options.
if ( ArgsUtils.inputStream == null ) {
@@ -171,21 +171,21 @@ public static void showStatistics(Instant startTime)
logger.warn("A SIGINT signal was received, so some of the \"checked-urls\" may have not been actually checked, that's more of a number of the \"loaded-urls\".");

logger.info("Total " + ArgsUtils.targetUrlType + "s found: " + UrlUtils.sumOfDocUrlsFound + ". That's about: " + df.format(UrlUtils.sumOfDocUrlsFound.get() * 100.0 / inputCheckedUrlNum) + "% from the total numOfUrls checked. The rest were problematic or non-handleable url-cases.");
if ( FileUtils.shouldDownloadDocFiles ) {
if ( ArgsUtils.shouldDownloadDocFiles ) {
int numOfStoredDocFiles = 0;
if ( !FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.numberName) ) // If we have anything different from the numberName-type..
if ( !ArgsUtils.fileNameType.equals(ArgsUtils.fileNameTypeEnum.numberName) ) // If we have anything different from the numberName-type..
numOfStoredDocFiles = FileUtils.numOfDocFiles.get();
else
numOfStoredDocFiles = FileUtils.numOfDocFile - ArgsUtils.initialNumOfDocFile;
numOfStoredDocFiles = FileUtils.numOfDocFile - ArgsUtils.initialNumOfFile;
logger.info("From which docUrls, we were able to retrieve: " + numOfStoredDocFiles + " distinct docFiles. That's about: " + df.format(numOfStoredDocFiles * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%."
+ " The un-retrieved docFiles were either belonging to already-found docUrls or they had connection-issues.");
+ " The un-retrieved docFiles were either belonging to already-found " + ArgsUtils.targetUrlType + "s or they had connection-issues or they had problematic content.");
}
logger.debug("The metaDocUrl-handler is responsible for the discovery of " + MetadataHandler.numOfMetaDocUrlsFound + " docUrls (" + df.format(MetadataHandler.numOfMetaDocUrlsFound.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls).");
logger.debug("The re-crossed docUrls (from all handlers) were " + ConnSupportUtils.reCrossedDocUrls.get() + ". That's about " + df.format(ConnSupportUtils.reCrossedDocUrls.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls.");
logger.debug("The re-crossed " + ArgsUtils.targetUrlType + "s (from all handlers) were " + ConnSupportUtils.reCrossedDocUrls.get() + ". That's about " + df.format(ConnSupportUtils.reCrossedDocUrls.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls.");
if ( MachineLearning.useMLA )
logger.debug("The M.L.A. is responsible for the discovery of " + MachineLearning.docUrlsFoundByMLA.get() + " of the docUrls (" + df.format(MachineLearning.docUrlsFoundByMLA.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%). The M.L.A.'s average success-rate was: " + df.format(MachineLearning.getAverageSuccessRate()) + "%. Gathered data for " + MachineLearning.timesGatheredData.get() + " valid pageUrl-docUrl pairs.");
logger.debug("The legacy M.L.A. is responsible for the discovery of " + MachineLearning.docUrlsFoundByMLA.get() + " of the " + ArgsUtils.targetUrlType + "s (" + df.format(MachineLearning.docUrlsFoundByMLA.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%). The M.L.A.'s average success-rate was: " + df.format(MachineLearning.getAverageSuccessRate()) + "%. Gathered data for " + MachineLearning.timesGatheredData.get() + " valid pageUrl-docUrl pairs.");
else
logger.debug("The M.L.A. was not enabled.");
logger.debug("The legacy M.L.A. was not enabled.");

logger.debug("About " + df.format(LoaderAndChecker.connProblematicUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + LoaderAndChecker.connProblematicUrls.get() + " urls) were pages which had connectivity problems.");
logger.debug("About " + df.format(MetadataHandler.numOfProhibitedAccessPagesFound.get() * 100.0 / inputCheckedUrlNum) + "% (" + MetadataHandler.numOfProhibitedAccessPagesFound.get() + " urls) were pages with prohibited access.");
@@ -322,7 +322,7 @@ public static void sortConcurrentHashMapByValueAndPrint(ConcurrentHashMap<String
else
return o1.getValue().compareTo(o2.getValue());
});
logger.debug("The " + list.size() + " domains which gave docUrls and their number:");
logger.debug("The " + list.size() + " domains which gave " + ArgsUtils.targetUrlType + "s and their number:");
/* for ( Map.Entry<String, Integer> entry : list )
logger.debug(entry.getKey() + " : " + entry.getValue());*/
}
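
Since the hunk above shows only the tail of the value-comparator in `sortConcurrentHashMapByValueAndPrint()`, here is a brief, self-contained sketch of the underlying technique: copying the entries of a `ConcurrentHashMap` into a list, sorting them by value, and printing them. The map contents and the descending order are illustrative assumptions, not taken from the actual method.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class SortByValueSketch {
    public static void main(String[] args) {
        // Illustrative data: domains mapped to the number of docUrls they yielded.
        ConcurrentHashMap<String, Integer> domainsWithDocUrls = new ConcurrentHashMap<>();
        domainsWithDocUrls.put("example.org", 42);
        domainsWithDocUrls.put("repository.test", 7);
        domainsWithDocUrls.put("journals.sample", 19);

        // Copy the entries into a list and sort it by value (descending here, as an assumption).
        List<Map.Entry<String, Integer>> list = new ArrayList<>(domainsWithDocUrls.entrySet());
        list.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));

        System.out.println("The " + list.size() + " domains and their counts:");
        for ( Map.Entry<String, Integer> entry : list )
            System.out.println(entry.getKey() + " : " + entry.getValue());
    }
}
```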
@@ -1,6 +1,7 @@
package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
@@ -43,12 +44,12 @@ public class MetadataHandler {
String regex = ".+\\.(?:";

if ( !LoaderAndChecker.retrieveDatasets )
regex += "zip|rar|"; // If no datasets retrieved, block these types.
regex += LoaderAndChecker.dataset_formats; // If no datasets retrieved, block these types.
else if ( !LoaderAndChecker.retrieveDocuments )
regex += "pdf|doc[x]?|"; // If no documents retrieved, block these types.
regex += "pdf|" + UrlTypeChecker.unsupportedDocFileTypes; // If no documents retrieved, block these types.
//else -> no more datatype-dependent additions

regex += "apk|jpg|png)(?:\\?.+)?$";
regex += "|apk|jpg|png)(?:\\?.+)?$";
logger.debug("COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS -> REGEX: " + regex);
COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS = Pattern.compile(regex);
}
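
For context on the change above: the leading `|` before `apk|jpg|png` compensates for the appended format-lists (`LoaderAndChecker.dataset_formats`, `UrlTypeChecker.unsupportedDocFileTypes`) no longer ending with a trailing `|`, as the previous hard-coded alternations did. Below is a minimal, self-contained sketch of how such an extension-blocking pattern can be assembled and tested; the concrete format-list and the sample URLs are illustrative stand-ins, not the project's actual constants.

```java
import java.util.regex.Pattern;

public class ExtensionFilterSketch {
    public static void main(String[] args) {
        // Illustrative stand-in for a configurable format-list (note: no trailing '|').
        String datasetFormats = "zip|rar|tar\\.gz";
        boolean retrieveDatasets = false;

        String regex = ".+\\.(?:";
        if ( !retrieveDatasets )
            regex += datasetFormats;            // block dataset-formats when datasets are not retrieved
        regex += "|apk|jpg|png)(?:\\?.+)?$";    // always-blocked extensions, with an optional query-string

        Pattern unsupportedExtensions = Pattern.compile(regex);

        System.out.println(unsupportedExtensions.matcher("https://example.org/file.zip").matches());       // true
        System.out.println(unsupportedExtensions.matcher("https://example.org/img.png?size=2").matches()); // true
        System.out.println(unsupportedExtensions.matcher("https://example.org/paper.pdf").matches());      // false
    }
}
```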
@@ -133,7 +134,7 @@ public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, Str

String lowerCaseMetaDocUrl = metaDocUrl.toLowerCase();

if ( UrlTypeChecker.CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches()
if ( (ArgsUtils.shouldDownloadDocFiles && UrlTypeChecker.CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches())
|| UrlTypeChecker.PLAIN_PAGE_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches()
|| UrlTypeChecker.URL_DIRECTORY_FILTER.matcher(lowerCaseMetaDocUrl).matches()
|| COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS.matcher(lowerCaseMetaDocUrl).matches() ) // These do not lead us to avoid crawling the page, since the metaDocUrl may be an image, but the page may also include a full-text inside.
@@ -599,7 +599,7 @@ public static boolean verifyDocLink(String urlId, String sourceUrl, String pageU
//logger.debug("Going to check DocLink: " + docLink); // DEBUG!
try {
if ( !HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, docLink, null, false, true) ) { // We log the docUrl inside this method.
logger.warn("The DocLink < " + docLink + " > was not a docUrl (unexpected)!");
logger.warn("The DocLink < " + docLink + " > was not a " + ArgsUtils.targetUrlType + " (unexpected)!");
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + docLink + " > was not a docUrl.", null, true, "true", "true", "false", "false", "false", null, "null");
return false;
}

This file was deleted.

@@ -0,0 +1,19 @@
package eu.openaire.publications_retriever.exceptions;


/**
* This class implements the new custom exception: "FileNotRetrievedException".
* This exception is used to signal a failure in retrieving a docFile.
* @author Lampros Smyrnaios
*/
public class FileNotRetrievedException extends Exception
{
public FileNotRetrievedException() {}

private String errorMessage = null;

public FileNotRetrievedException(String errorMessage) { this.errorMessage = errorMessage; }

@Override
public String getMessage() { return errorMessage; }
}
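
Below is a brief, hypothetical usage sketch for the new `FileNotRetrievedException`, assuming the class shown above is on the classpath; the `downloadFile` helper and its failure condition are invented for illustration and are not part of the actual codebase.

```java
import eu.openaire.publications_retriever.exceptions.FileNotRetrievedException;

public class FileRetrievalSketch {

    // Hypothetical helper: pretend to download a docFile and fail with the new exception.
    static void downloadFile(String docUrl) throws FileNotRetrievedException {
        boolean downloadSucceeded = false;  // stand-in for real connection / storage logic
        if ( !downloadSucceeded )
            throw new FileNotRetrievedException("Could not retrieve the docFile from: " + docUrl);
    }

    public static void main(String[] args) {
        try {
            downloadFile("https://example.org/paper.pdf");
        } catch (FileNotRetrievedException fnre) {
            // The caller can log the reason and move on to the next url.
            System.err.println("Download failed: " + fnre.getMessage());
        }
    }
}
```

Since the class extends `Exception` (a checked exception), callers of the download path are forced to handle or declare the failure explicitly.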