- Fix words that were separated only by a line-break being concatenated when acquiring the HTML-code.

- Avoid some problematic urls.
- Update dependencies.
- Code polishing.
LSmyrnaios · Nov 2, 2023 · 4edfd67 · 4edfd67
1 parent f8ace6b
commit 4edfd67
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 15 deletions.
diff --git a/pom.xml b/pom.xml
@@ -61,7 +61,7 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
-        <version>3.1.2</version>
+        <version>3.2.1</version>
         <configuration>
           <!--<excludes>
             <exclude>some test to exclude here</exclude>
@@ -106,7 +106,7 @@
     <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
-      <version>1.16.1</version>
+      <version>1.16.2</version>
     </dependency>
 
     <!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
@@ -127,7 +127,7 @@
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
-      <version>2.14.0</version>
+      <version>2.15.0</version>
     </dependency>
 
     <!-- https://mvnrepository.com/artifact/org.json/json -->

diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
@@ -80,7 +80,7 @@ public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl,
             // Block the domain and return "true" to indicate handled-state.
             HttpConnUtils.blacklistedDomains.add(pageDomain);
             logger.warn("Domain: \"" + pageDomain + "\" was blocked, after giving a dynamic metaDocUrl: " + metaDocUrl);
-            UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", null, true, "true", "true", "false", "false", "false", null, "null");  // We log the source-url, and that was discarded in "PageCrawler.visit()".
+            UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", null, true, "true", "true", "false", "false", "false", null, "null");
             PageCrawler.contentProblematicUrls.incrementAndGet();
             return true;    // Since the domain is blocked, there is no point in continuing to crawl.
         }
@@ -165,7 +165,6 @@ public static String getMetaDocUrlFromHTML(String pageHtml)
             return null;    // It was not found and so it was not handled. We don't log the sourceUrl, since it will be handled later.
 
         //logger.debug("Matched meta-doc-url-line: " + metaDocUrlMatcher.group(0));	// DEBUG!!
-
         String metaDocUrl = null;
         try {
             metaDocUrl = metaDocUrlMatcher.group(1);

diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
@@ -770,8 +770,8 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
 			String inputLine;
 			while ( (inputLine = br.readLine()) != null )
 			{
-				if ( !inputLine.isEmpty() && (inputLine.length() != 1) && !SPACE_ONLY_LINE.matcher(inputLine).matches() ) {	// We check for (inputLine.length() != 1), as some lines contain an unrecognized byte.
-					htmlStrB.append(inputLine);
+				if ( !inputLine.isEmpty() && (inputLine.length() != 1) && !SPACE_ONLY_LINE.matcher(inputLine).matches() ) {	// We check for (inputLine.length() != 1), as some lines contain just an unrecognized byte.
+					htmlStrB.append(inputLine).append(" ");	// Add a space where the line-break was, in order to not join words in text, which are separated by a new line in the html-code.
 					//logger.debug(inputLine);	// DEBUG!
 				}
 			}
@@ -828,10 +828,13 @@ public static ArrayList<Object> detectContentTypeFromResponseBody(String finalUr
 					mimeType = "application/pdf";
 					calledForPossibleDocUrl = true;	// Important for the re-connection.
 					foundDetectedContentType = true;
+					// The "bufferedReader" has already been closed.
 				} else if ( detectedContentType.detectedContentType.equals("undefined") )
 					logger.debug("The url with the undeclared content type < " + finalUrlStr + " >, was examined and found to have UNDEFINED contentType.");
+					// The "bufferedReader" has already been closed.
 				else
 					warnMsg += "\nUnspecified \"detectedContentType\": " + detectedContentType.detectedContentType;
+					// Normally, we should never reach here. The BufferedReader should be null.
 			}
 			else	//  ( detectedContentType == null )
 				warnMsg += "\nCould not retrieve the response-body for url: " + finalUrlStr;
@@ -840,6 +843,7 @@ public static ArrayList<Object> detectContentTypeFromResponseBody(String finalUr
 			warnMsg += "\nThe initial connection was made with the \"HTTP-HEAD\" method, so there is no response-body to use to detect the content-type.";
 
 		if ( !foundDetectedContentType && wasConnectedWithHTTPGET ) {	// If it could be detected (by using the "GET"  method to take the response-body), but it was not, only then go and check if it should be blocked.
+			// The BufferedReader should be null here.
 			if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, HttpConnUtils.timesDomainsReturnedNoType, domainStr, timesToReturnNoTypeBeforeDomainBlocked, true) ) {
 				logger.warn(warnMsg);
 				logger.warn("Domain: \"" + domainStr + "\" was blocked after returning no Type-info more than " + timesToReturnNoTypeBeforeDomainBlocked + " times.");
@@ -888,10 +892,12 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn
 				logger.debug(Arrays.toString(inputLine.chars().toArray()));
 			}*/
 
-			//logger.debug("First line of RequestBody: " + inputLine);	// DEBUG!
+			// Check if the stream ended before we could find an "accepted" line.
 			if ( inputLine == null )
 				return null;
 
+			//logger.debug("First actual line of RequestBody: " + inputLine);	// DEBUG!
+
 			String lowerCaseInputLine = inputLine.toLowerCase();
 			//logger.debug(lowerCaseInputLine + "\nLength of line: "  + lowerCaseInputLine.length());	// DEBUG!
 			if ( HTML_STRING_MATCH.matcher(lowerCaseInputLine).matches() )
@@ -989,12 +995,8 @@ public static String getFullyFormedUrl(String pageUrl, String currentLink, URL u
 
 	public static boolean isJustAnHTTPSredirect(String currentUrl, String targetUrl)
 	{
-		// First check if we go from an http to an https in general.
-		if ( !currentUrl.startsWith("http://", 0) && !targetUrl.startsWith("https://", 0) )
-			return false;
-
-		// Take the url after the protocol and check if it's the same, if it is then we have our HTTPS redirect, if not then it's another type of redirect.
-		return haveOnlyProtocolDifference(currentUrl, targetUrl);
+		return ( currentUrl.startsWith("http://", 0) && targetUrl.startsWith("https://", 0)
+				&& haveOnlyProtocolDifference(currentUrl, targetUrl) );
 	}
 
 

diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
@@ -34,7 +34,7 @@ public class UrlTypeChecker
 					+ "|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$"	// Url ends with these. Note that some of them are likely to be part of a docUrl, for ex. the "/trends/"-dir.
 					// TODO - In case we have just DocUrls (not datasetUrls), exclude the following as well: "/(?:bibtext|dc(?:terms)?|tei|endnote)$", it could be added in another regex.. or do an initialization check and construct this regex based on the url-option provided.
 					+ "|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)"
-					+ "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|cookie|notfound|(?:404|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern + ").*");
+					+ "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|cookie|(?:page-)?not[-]?found|(?:404(?:_response)|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern + ").*");
 
 	// We check them as a directory to avoid discarding publications' urls about these subjects. There's "acesso" (single "c") in Portuguese.. Also there's "autore" & "contatto" in Italian.