diff --git a/pom.xml b/pom.xml
index 48ae523..d9cf90e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -116,6 +116,20 @@
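<version>32.1.3-jre</version>
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>1.25.0</version>
+ </dependency>
+
+
+ <dependency>
+ <groupId>org.brotli</groupId>
+ <artifactId>dec</artifactId>
+ <version>0.1.2</version>
+ </dependency>
+
<groupId>org.apache.commons</groupId>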
diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
index 61145ec..38e7b13 100644
--- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
+++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
@@ -88,7 +88,7 @@ public static void visit(String urlId, String sourceUrl, String pageUrl, String
}
String pageHtml = null; // Get the pageHtml to parse the page.
- if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader)) == null ) {
+ if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader, false)) == null ) {
logger.warn("Could not retrieve the HTML-code for pageUrl: " + pageUrl);
UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + pageContentType + "'.", null, true, "true", "true", "false", "false", "true", null, "null");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
index 54aec23..96a7260 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
@@ -5,6 +5,7 @@
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.MachineLearning;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
+import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
@@ -349,9 +350,14 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) )
numOfDocFiles.incrementAndGet();
File docFile = docFileData.getDocFile();
+
+ InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
+ if ( inputStream == null )
+ throw new DocFileNotRetrievedException("Could not acquire the inputStream!");
+
FileOutputStream fileOutputStream = docFileData.getFileOutputStream();
- try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb);
+ try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) )
{
int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
@@ -442,7 +448,11 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect
File docFile = new File(storeDocFilesDir + (numOfDocFile++) + ".pdf"); // First use the "numOfDocFile" and then increment it.
// TODO - Later, on different fileTypes, take care of the extension properly.
- try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb);
+ InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
+ if ( inputStream == null )
+ throw new DocFileNotRetrievedException("Could not acquire the inputStream!");
+
+ try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb))
{
int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
index 5f43af0..d2f9e1e 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
@@ -14,6 +14,9 @@
import eu.openaire.publications_retriever.util.file.IdUrlTuple;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
+import org.apache.commons.compress.compressors.brotli.BrotliCompressorInputStream;
+import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileDeleteStrategy;
import org.apache.commons.lang3.StringUtils;
@@ -24,6 +27,7 @@
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.util.*;
@@ -435,7 +439,7 @@ public static String getInternalLinkFromHTTP300Page(HttpURLConnection conn)
{
try {
String html = null;
- if ( (html = ConnSupportUtils.getHtmlString(conn, null)) == null ) {
+ if ( (html = ConnSupportUtils.getHtmlString(conn, null, false)) == null ) {
logger.warn("Could not retrieve the HTML-code for HTTP300PageUrl: " + conn.getURL().toString());
return null;
}
@@ -517,36 +521,76 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro
}
- public static String getErrorMessageFromResponseBody(HttpURLConnection conn)
+ /**
+ * Opens the response-body stream of the given connection and, when the response carries a "content-encoding" header,
+ * replaces it with the stream returned by "getCompressedInputStream()".
+ * On success the returned stream is NOT closed here; the callers visible in this patch close it themselves.
+ *
+ * @param conn the connection whose body should be read
+ * @param isForError if true, "conn.getErrorStream()" is used, otherwise "conn.getInputStream()"
+ * @return the stream to read the body from, or null when no error-data is provided, when the stream could not be opened,
+ * or when "getCompressedInputStream()" returned null for the declared encoding (the raw stream is closed in that case)
+ */
+ public static InputStream checkEncodingAndGetInputStream(HttpURLConnection conn, boolean isForError)
{
- final StringBuilder msgStrB = new StringBuilder(500);
- try ( InputStream inputStream = conn.getErrorStream() ) {
- if ( inputStream == null ) // No error-data is provided.
+ InputStream inputStream = null;
+ try {
+ inputStream = (isForError ? conn.getErrorStream() : conn.getInputStream());
+ if ( isForError && (inputStream == null) ) // Only the "getErrorStream" may return null.
return null;
+ } catch (Exception e) {
+ // Give the log-entry some context, instead of an empty message.
+ logger.error("Could not acquire the " + (isForError ? "error" : "input") + "-stream for url: " + conn.getURL(), e);
+ return null;
+ }
+ // Determine the potential encoding
+ String encoding = conn.getHeaderField("content-encoding");
+ if ( encoding != null ) {
+ String url = conn.getURL().toString();
+ if ( logger.isTraceEnabled() )
+ logger.trace("Url \"" + url + "\" has content-encoding: " + encoding);
+ InputStream compressedInputStream = getCompressedInputStream(inputStream, encoding, url, isForError);
+ if ( compressedInputStream == null ) {
+ try {
+ inputStream.close(); // The raw stream will not be handed to the caller, so release it here.
+ } catch (IOException ignored) {
+ // Best-effort close: the stream is being discarded and the real failure is logged inside "getCompressedInputStream()".
+ }
+ return null; // The error is logged inside the called method.
+ }
+ inputStream = compressedInputStream;
+ }
- try ( BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)) ) {
- String inputLine;
- while ( (inputLine = br.readLine()) != null ) { // Returns the line, without the ending-line characters.
- if ( !inputLine.isEmpty() )
- msgStrB.append(inputLine).append(" "); // We want a single finale line, not a multi-line text.
- }
-
- if ( msgStrB.length() == 0 )
- return null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function.
+ return inputStream;
+ }
- String errorText = Jsoup.parse(msgStrB.toString()).text(); // It's already "trimmed".
- if ( errorText.length() == 0 )
- return null;
- return errorText;
- } catch (IOException ioe) {
- logger.error("IOException when retrieving the response-body: " + ioe.getMessage());
+ /**
+ * Wraps the given raw body-stream in the decompressing stream that matches the given "content-encoding" value.
+ * Supported codings: "gzip" (and its alias "x-gzip"), "deflate" and "br". The comparison ignores case and surrounding whitespace.
+ * A null, empty or "identity" value means that no transformation was applied, so the given stream is returned as it is.
+ *
+ * @param inputStream the raw stream of the response-body
+ * @param encoding the value of the "content-encoding" header
+ * @param url the url of the response, used only in the log-messages
+ * @param isForError not used inside this method
+ * @return the stream to read the decoded body from, or null when the coding is unsupported or the decompressing stream
+ * could not be created (both cases are logged here). The given stream is NOT closed by this method.
+ */
+ public static InputStream getCompressedInputStream(InputStream inputStream, String encoding, String url, boolean isForError)
+ {
+ // Content-coding names are case-insensitive and may arrive with surrounding whitespace.
+ String coding = ((encoding != null) ? encoding.trim() : "");
+ if ( coding.isEmpty() || coding.equalsIgnoreCase("identity") )
+ return inputStream; // Nothing to decode.
+
+ InputStream compressedInputStream;
+ try {
+ if ( coding.equalsIgnoreCase("gzip") || coding.equalsIgnoreCase("x-gzip") )
+ compressedInputStream = new GzipCompressorInputStream(inputStream);
+ else if ( coding.equalsIgnoreCase("deflate") )
+ compressedInputStream = new DeflateCompressorInputStream(inputStream);
+ else if ( coding.equalsIgnoreCase("br") )
+ compressedInputStream = new BrotliCompressorInputStream(inputStream);
+ else {
+ logger.warn("An unsupported \"content-encoding\" (" + encoding + ") was received from url: " + url);
return null;
}
- } catch (Exception e) {
- logger.error("Could not extract the response-body!", e);
+ } catch (IOException ioe) {
+ String exMsg = ioe.getMessage(); // May be null, e.g. for an "EOFException" thrown on a truncated stream.
+ if ( (exMsg != null) && exMsg.startsWith("Input is not in the") )
+ logger.warn(exMsg + " | http-published-encoding: " + encoding + " | url: " + url);
+ // Some urls do not return valid html-either way.
+ else
+ logger.error("Could not acquire the compressorInputStream for encoding: " + encoding + " | url: " + url, ioe);
return null;
}
+ return compressedInputStream;
+ }
+
+
+ /**
+ * Returns the plain-text error-message found in the response-body of the given connection.
+ * The body is read through "getHtmlString()" with the "isForError" flag set, and its markup is stripped by Jsoup.
+ *
+ * @param conn the connection which returned the error-response
+ * @return the extracted text, or null when the body could not be retrieved, is empty, is longer than 10000 characters,
+ * or contains no text after the markup is stripped
+ */
+ public static String getErrorMessageFromResponseBody(HttpURLConnection conn)
+ {
+ final String responseBody = getHtmlString(conn, null, true);
+ if ( (responseBody == null) || responseBody.isEmpty() || (responseBody.length() > 10000) )
+ return null; // No body, an empty one, or one which exceeds the length-limit.
+
+ // The text returned by Jsoup is already "trimmed".
+ final String plainText = Jsoup.parse(responseBody).text();
+ if ( plainText.isEmpty() )
+ return null;
+
+ return plainText;
+ }
@@ -752,7 +796,7 @@ public static List blockSharedSiteSessionDomains(String targetUrl, Strin
public static ThreadLocal htmlStrBuilder = new ThreadLocal(); // Every Thread has its own variable.
- public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader)
+ public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError)
{
if ( getContentSize(conn, false) == -1 ) { // "Unacceptable size"-code..
logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
@@ -766,7 +810,14 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
htmlStrBuilder.set(htmlStrB); // Save it for future use by this thread.
}
- try (BufferedReader br = (bufferedReader != null ? bufferedReader : new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb)) ) // Try-with-resources
+ InputStream inputStream = null;
+ if ( bufferedReader == null ) {
+ inputStream = checkEncodingAndGetInputStream(conn, isForError);
+ if ( inputStream == null ) // The error is already logged inside.
+ return null;
+ }
+
+ try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) ) // Try-with-resources
{
String inputLine;
while ( (inputLine = br.readLine()) != null )
@@ -788,6 +839,12 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
return null;
} finally {
htmlStrB.setLength(0); // Reset "StringBuilder" WITHOUT re-allocating.
+ try {
+ if ( inputStream != null )
+ inputStream.close();
+ } catch (IOException ioe) {
+ // Ignore.
+ }
}
}
@@ -878,9 +935,13 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn
return null;
}
+ InputStream inputStream = checkEncodingAndGetInputStream(conn, false);
+ if ( inputStream == null )
+ return null;
+
BufferedReader br = null;
try {
- br = new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb);
+ br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb);
String inputLine;
// Skip empty lines in the beginning of the HTML-code
@@ -1052,7 +1113,6 @@ public static boolean haveOnlyProtocolDifference(String url1, String url2)
public static InputStream getInputStreamFromInputDataUrl()
{
- InputStream inputStream = null;
if ( (PublicationsRetriever.inputDataUrl == null) || PublicationsRetriever.inputDataUrl.isEmpty() ) {
String errorMessage = "The \"inputDataUrl\" was not given, even though";
logger.error(errorMessage);
@@ -1061,6 +1121,7 @@ public static InputStream getInputStreamFromInputDataUrl()
System.exit(55);
}
+ InputStream inputStream = null;
try {
HttpURLConnection conn = HttpConnUtils.handleConnection(null, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, null, true, true);
String mimeType = conn.getHeaderField("Content-Type");
@@ -1072,7 +1133,12 @@ public static InputStream getInputStreamFromInputDataUrl()
System.exit(56);
}
- inputStream = new BufferedInputStream(conn.getInputStream(), FileUtils.fiveMb);
+ inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
+ if ( inputStream == null )
+ throw new RuntimeException("Could not acquire the InputStream!");
+
+ // Wrap it with a buffer, for increased efficiency.
+ inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb);
} catch (Exception e) {
String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage();
diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java
index 0765236..81f3a3c 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java
@@ -328,7 +328,13 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty("User-Agent", userAgent);
conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
- //conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
+
+ conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
+ //conn.setRequestProperty("TE", "trailers"); // TODO - Investigate the "transfer-encoding" header.
+
+ if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
+ conn.setRequestProperty("Accept-Language", acceptLanguage);
+
conn.setRequestProperty("DNT", "1");
conn.setRequestProperty("Connection", "keep-alive");
conn.setRequestProperty("Sec-Fetch-Dest", "document");
@@ -339,8 +345,6 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
conn.setRequestProperty("Cache-Control", "no-cache");
conn.setRequestProperty("Host", domainStr);
- if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
- conn.setRequestProperty("Accept-Language", acceptLanguage);
conn.setInstanceFollowRedirects(false); // We manage redirects on our own, in order to control redirectsNum, avoid redirecting to unwantedUrls and handling errors.
boolean useHttpGetMethod = false;
diff --git a/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java b/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java
index 065d765..756fa72 100644
--- a/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java
+++ b/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java
@@ -1,7 +1,6 @@
package eu.openaire.publications_retriever.test;
import com.google.common.collect.HashMultimap;
-import crawlercommons.filters.basic.BasicURLNormalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.file.FileUtils;