Skip to content

Commit

Permalink
- Add support for compressed content.
Browse files Browse the repository at this point in the history
- Make sure the html is being read using the "UTF-8" charset.
  • Loading branch information
LSmyrnaios committed Nov 29, 2023
1 parent 905872c commit 65af29a
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 34 deletions.
14 changes: 14 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@
<version>32.1.3-jre</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.25.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.brotli/dec -->
<dependency>
<groupId>org.brotli</groupId>
<artifactId>dec</artifactId>
<version>0.1.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public static void visit(String urlId, String sourceUrl, String pageUrl, String
}

String pageHtml = null; // Get the pageHtml to parse the page.
if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader)) == null ) {
if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader, false)) == null ) {
logger.warn("Could not retrieve the HTML-code for pageUrl: " + pageUrl);
UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + pageContentType + "'.", null, true, "true", "true", "false", "false", "true", null, "null");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.MachineLearning;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
Expand Down Expand Up @@ -349,9 +350,14 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) )
numOfDocFiles.incrementAndGet();

File docFile = docFileData.getDocFile();

InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
throw new DocFileNotRetrievedException("Could not acquire the inputStream!");

FileOutputStream fileOutputStream = docFileData.getFileOutputStream();

try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb);
try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) )
{
int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
Expand Down Expand Up @@ -442,7 +448,11 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect
File docFile = new File(storeDocFilesDir + (numOfDocFile++) + ".pdf"); // First use the "numOfDocFile" and then increment it.
// TODO - Later, on different fileTypes, take care of the extension properly.

try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb);
InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
throw new DocFileNotRetrievedException("Could not acquire the inputStream!");

try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb))
{
int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
import eu.openaire.publications_retriever.util.file.IdUrlTuple;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import org.apache.commons.compress.compressors.brotli.BrotliCompressorInputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileDeleteStrategy;
import org.apache.commons.lang3.StringUtils;

Expand All @@ -24,6 +27,7 @@
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.util.*;
Expand Down Expand Up @@ -435,7 +439,7 @@ public static String getInternalLinkFromHTTP300Page(HttpURLConnection conn)
{
try {
String html = null;
if ( (html = ConnSupportUtils.getHtmlString(conn, null)) == null ) {
if ( (html = ConnSupportUtils.getHtmlString(conn, null, false)) == null ) {
logger.warn("Could not retrieve the HTML-code for HTTP300PageUrl: " + conn.getURL().toString());
return null;
}
Expand Down Expand Up @@ -517,36 +521,76 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro
}


public static String getErrorMessageFromResponseBody(HttpURLConnection conn)
public static InputStream checkEncodingAndGetInputStream(HttpURLConnection conn, boolean isForError)
{
final StringBuilder msgStrB = new StringBuilder(500);
try ( InputStream inputStream = conn.getErrorStream() ) {
if ( inputStream == null ) // No error-data is provided.
InputStream inputStream = null;
try {
inputStream = (isForError ? conn.getErrorStream() : conn.getInputStream());
if ( isForError && (inputStream == null) ) // Only the "getErrorStream" may return null.
return null;
} catch (Exception e) {
logger.error("", e);
return null;
}
// Determine the potential encoding
String encoding = conn.getHeaderField("content-encoding");
if ( encoding != null ) {
String url = conn.getURL().toString();
if ( logger.isTraceEnabled() )
logger.trace("Url \"" + url + "\" has content-encoding: " + encoding);
InputStream compressedInputStream = getCompressedInputStream(inputStream, encoding, url, isForError);
if ( compressedInputStream == null ) {
try {
inputStream.close();
} catch (IOException ioe) {}
return null; // The error is logged inside the called method.
}
inputStream = compressedInputStream;
}

try ( BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)) ) {
String inputLine;
while ( (inputLine = br.readLine()) != null ) { // Returns the line, without the ending-line characters.
if ( !inputLine.isEmpty() )
msgStrB.append(inputLine).append(" "); // We want a single finale line, not a multi-line text.
}

if ( msgStrB.length() == 0 )
return null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function.
return inputStream;
}

String errorText = Jsoup.parse(msgStrB.toString()).text(); // It's already "trimmed".
if ( errorText.length() == 0 )
return null;

return errorText;
} catch (IOException ioe) {
logger.error("IOException when retrieving the response-body: " + ioe.getMessage());
/**
 * Wraps the given raw InputStream with a decompressing stream matching the received "content-encoding".
 * Supported encodings: "gzip", "deflate" and "br" (Brotli). The comparison is case-insensitive,
 * since HTTP content-coding tokens are case-insensitive (RFC 9110).
 * @param inputStream the raw response-body stream
 * @param encoding the value of the "content-encoding" response header (non-null)
 * @param url the url of the response (used only for logging)
 * @param isForError kept for interface-compatibility with callers; not used by the current implementation
 * @return the decompressing InputStream, or null when the encoding is unsupported or the wrapper could not be created (logged inside)
 */
public static InputStream getCompressedInputStream(InputStream inputStream, String encoding, String url, boolean isForError)
{
    InputStream compressedInputStream;
    try {
        if ( encoding.equalsIgnoreCase("gzip") )
            compressedInputStream = new GzipCompressorInputStream(inputStream);
        else if ( encoding.equalsIgnoreCase("deflate") )
            compressedInputStream = new DeflateCompressorInputStream(inputStream);
        else if ( encoding.equalsIgnoreCase("br") )
            compressedInputStream = new BrotliCompressorInputStream(inputStream);
        else {
            logger.warn("An unsupported \"content-encoding\" (" + encoding + ") was received from url: " + url);
            return null;
        }
    } catch (IOException ioe) {
        String exMsg = ioe.getMessage();
        if ( (exMsg != null) && exMsg.startsWith("Input is not in the") ) {
            // The data did not match the declared encoding. Some urls do not return valid html either way.
            logger.warn(exMsg + " | http-published-encoding: " + encoding + " | url: " + url);
        } else
            logger.error("Could not acquire the compressorInputStream for encoding: " + encoding + " | url: " + url, ioe);
        return null;
    }
    return compressedInputStream;
}


/**
 * Extracts a plain-text error-message from the response-body of a failed request.
 * Bodies that could not be read, are empty, or exceed 10000 chars are discarded.
 * @param conn an already-connected HttpURLConnection whose error-body should be read
 * @return the extracted error-text, or null when no usable message was found
 */
public static String getErrorMessageFromResponseBody(HttpURLConnection conn)
{
    final String html = getHtmlString(conn, null, true);
    if ( (html == null) || html.isEmpty() || (html.length() > 10000) )
        return null;

    final String errorText = Jsoup.parse(html).text();  // The result is already "trimmed".
    return (errorText.isEmpty() ? null : errorText);
}


Expand Down Expand Up @@ -752,7 +796,7 @@ public static List<String> blockSharedSiteSessionDomains(String targetUrl, Strin

public static ThreadLocal<StringBuilder> htmlStrBuilder = new ThreadLocal<StringBuilder>(); // Every Thread has its own variable.

public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader)
public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError)
{
if ( getContentSize(conn, false) == -1 ) { // "Unacceptable size"-code..
logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
Expand All @@ -766,7 +810,14 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
htmlStrBuilder.set(htmlStrB); // Save it for future use by this thread.
}

try (BufferedReader br = (bufferedReader != null ? bufferedReader : new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb)) ) // Try-with-resources
InputStream inputStream = null;
if ( bufferedReader == null ) {
inputStream = checkEncodingAndGetInputStream(conn, isForError);
if ( inputStream == null ) // The error is already logged inside.
return null;
}

try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) ) // Try-with-resources
{
String inputLine;
while ( (inputLine = br.readLine()) != null )
Expand All @@ -788,6 +839,12 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
return null;
} finally {
htmlStrB.setLength(0); // Reset "StringBuilder" WITHOUT re-allocating.
try {
if ( inputStream != null )
inputStream.close();
} catch (IOException ioe) {
// Ignore.
}
}
}

Expand Down Expand Up @@ -878,9 +935,13 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn
return null;
}

InputStream inputStream = checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
return null;

BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb);
br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb);
String inputLine;

// Skip empty lines in the beginning of the HTML-code
Expand Down Expand Up @@ -1052,7 +1113,6 @@ public static boolean haveOnlyProtocolDifference(String url1, String url2)

public static InputStream getInputStreamFromInputDataUrl()
{
InputStream inputStream = null;
if ( (PublicationsRetriever.inputDataUrl == null) || PublicationsRetriever.inputDataUrl.isEmpty() ) {
String errorMessage = "The \"inputDataUrl\" was not given, even though";
logger.error(errorMessage);
Expand All @@ -1061,6 +1121,7 @@ public static InputStream getInputStreamFromInputDataUrl()
System.exit(55);
}

InputStream inputStream = null;
try {
HttpURLConnection conn = HttpConnUtils.handleConnection(null, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, null, true, true);
String mimeType = conn.getHeaderField("Content-Type");
Expand All @@ -1072,7 +1133,12 @@ public static InputStream getInputStreamFromInputDataUrl()
System.exit(56);
}

inputStream = new BufferedInputStream(conn.getInputStream(), FileUtils.fiveMb);
inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
throw new RuntimeException("Could not acquire the InputStream!");

// Wrap it with a buffer, for increased efficiency.
inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb);

} catch (Exception e) {
String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,13 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty("User-Agent", userAgent);
conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
//conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");

conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
//conn.setRequestProperty("TE", "trailers"); // TODO - Investigate the "transfer-encoding" header.

if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
conn.setRequestProperty("Accept-Language", acceptLanguage);

conn.setRequestProperty("DNT", "1");
conn.setRequestProperty("Connection", "keep-alive");
conn.setRequestProperty("Sec-Fetch-Dest", "document");
Expand All @@ -339,8 +345,6 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
conn.setRequestProperty("Cache-Control", "no-cache");
conn.setRequestProperty("Host", domainStr);

if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
conn.setRequestProperty("Accept-Language", acceptLanguage);
conn.setInstanceFollowRedirects(false); // We manage redirects on our own, in order to control redirectsNum, avoid redirecting to unwantedUrls and handling errors.

boolean useHttpGetMethod = false;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package eu.openaire.publications_retriever.test;

import com.google.common.collect.HashMultimap;
import crawlercommons.filters.basic.BasicURLNormalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.file.FileUtils;
Expand Down

0 comments on commit 65af29a

Please sign in to comment.