Skip to content

Commit

Permalink
- Add support for compressed content.
Browse files Browse the repository at this point in the history
- Make sure the html is being read using the "UTF-8" charset.
  • Loading branch information
LSmyrnaios committed Nov 29, 2023
1 parent 905872c commit 65af29a
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 34 deletions.
14 changes: 14 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@
<version>32.1.3-jre</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.25.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.brotli/dec -->
<dependency>
<groupId>org.brotli</groupId>
<artifactId>dec</artifactId>
<version>0.1.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public static void visit(String urlId, String sourceUrl, String pageUrl, String
}

String pageHtml = null; // Get the pageHtml to parse the page.
if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader)) == null ) {
if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader, false)) == null ) {
logger.warn("Could not retrieve the HTML-code for pageUrl: " + pageUrl);
UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + pageContentType + "'.", null, true, "true", "true", "false", "false", "true", null, "null");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.MachineLearning;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
Expand Down Expand Up @@ -349,9 +350,14 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) )
numOfDocFiles.incrementAndGet();

File docFile = docFileData.getDocFile();

InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
throw new DocFileNotRetrievedException("Could not acquire the inputStream!");

FileOutputStream fileOutputStream = docFileData.getFileOutputStream();

try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb);
try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) )
{
int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
Expand Down Expand Up @@ -442,7 +448,11 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect
File docFile = new File(storeDocFilesDir + (numOfDocFile++) + ".pdf"); // First use the "numOfDocFile" and then increment it.
// TODO - Later, on different fileTypes, take care of the extension properly.

try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb);
InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
throw new DocFileNotRetrievedException("Could not acquire the inputStream!");

try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb))
{
int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
import eu.openaire.publications_retriever.util.file.IdUrlTuple;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import org.apache.commons.compress.compressors.brotli.BrotliCompressorInputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileDeleteStrategy;
import org.apache.commons.lang3.StringUtils;

Expand All @@ -24,6 +27,7 @@
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.util.*;
Expand Down Expand Up @@ -435,7 +439,7 @@ public static String getInternalLinkFromHTTP300Page(HttpURLConnection conn)
{
try {
String html = null;
if ( (html = ConnSupportUtils.getHtmlString(conn, null)) == null ) {
if ( (html = ConnSupportUtils.getHtmlString(conn, null, false)) == null ) {
logger.warn("Could not retrieve the HTML-code for HTTP300PageUrl: " + conn.getURL().toString());
return null;
}
Expand Down Expand Up @@ -517,36 +521,76 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro
}


public static String getErrorMessageFromResponseBody(HttpURLConnection conn)
public static InputStream checkEncodingAndGetInputStream(HttpURLConnection conn, boolean isForError)
{
final StringBuilder msgStrB = new StringBuilder(500);
try ( InputStream inputStream = conn.getErrorStream() ) {
if ( inputStream == null ) // No error-data is provided.
InputStream inputStream = null;
try {
inputStream = (isForError ? conn.getErrorStream() : conn.getInputStream());
if ( isForError && (inputStream == null) ) // Only the "getErrorStream" may return null.
return null;
} catch (Exception e) {
logger.error("", e);
return null;
}
// Determine the potential encoding
String encoding = conn.getHeaderField("content-encoding");
if ( encoding != null ) {
String url = conn.getURL().toString();
if ( logger.isTraceEnabled() )
logger.trace("Url \"" + url + "\" has content-encoding: " + encoding);
InputStream compressedInputStream = getCompressedInputStream(inputStream, encoding, url, isForError);
if ( compressedInputStream == null ) {
try {
inputStream.close();
} catch (IOException ioe) {}
return null; // The error is logged inside the called method.
}
inputStream = compressedInputStream;
}

try ( BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)) ) {
String inputLine;
while ( (inputLine = br.readLine()) != null ) { // Returns the line, without the ending-line characters.
if ( !inputLine.isEmpty() )
msgStrB.append(inputLine).append(" "); // We want a single finale line, not a multi-line text.
}

if ( msgStrB.length() == 0 )
return null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function.
return inputStream;
}

String errorText = Jsoup.parse(msgStrB.toString()).text(); // It's already "trimmed".
if ( errorText.length() == 0 )
return null;

return errorText;
} catch (IOException ioe) {
logger.error("IOException when retrieving the response-body: " + ioe.getMessage());
/**
 * Wraps the given raw InputStream with a decompressing stream matching the received "content-encoding".
 * Supported encodings: "gzip", "deflate" and "br" (Brotli). The comparison is case-insensitive,
 * since HTTP content-coding tokens are case-insensitive (RFC 9110).
 * @param inputStream the raw response-body stream
 * @param encoding the value of the "content-encoding" response header (non-null)
 * @param url the url of the response (used only for logging)
 * @param isForError kept for interface-compatibility with callers; not used by the current implementation
 * @return the decompressing InputStream, or null when the encoding is unsupported or the wrapper could not be created (logged inside)
 */
public static InputStream getCompressedInputStream(InputStream inputStream, String encoding, String url, boolean isForError)
{
    InputStream compressedInputStream;
    try {
        if ( encoding.equalsIgnoreCase("gzip") )
            compressedInputStream = new GzipCompressorInputStream(inputStream);
        else if ( encoding.equalsIgnoreCase("deflate") )
            compressedInputStream = new DeflateCompressorInputStream(inputStream);
        else if ( encoding.equalsIgnoreCase("br") )
            compressedInputStream = new BrotliCompressorInputStream(inputStream);
        else {
            logger.warn("An unsupported \"content-encoding\" (" + encoding + ") was received from url: " + url);
            return null;
        }
    } catch (IOException ioe) {
        String exMsg = ioe.getMessage();
        if ( (exMsg != null) && exMsg.startsWith("Input is not in the") ) {
            // The data did not match the declared encoding. Some urls do not return valid html either way.
            logger.warn(exMsg + " | http-published-encoding: " + encoding + " | url: " + url);
        } else
            logger.error("Could not acquire the compressorInputStream for encoding: " + encoding + " | url: " + url, ioe);
        return null;
    }
    return compressedInputStream;
}


/**
 * Extracts a plain-text error-message from the response-body of a failed request.
 * Bodies that could not be read, are empty, or exceed 10000 chars are discarded.
 * @param conn an already-connected HttpURLConnection whose error-body should be read
 * @return the extracted error-text, or null when no usable message was found
 */
public static String getErrorMessageFromResponseBody(HttpURLConnection conn)
{
    final String html = getHtmlString(conn, null, true);
    if ( (html == null) || html.isEmpty() || (html.length() > 10000) )
        return null;

    final String errorText = Jsoup.parse(html).text();  // The result is already "trimmed".
    return (errorText.isEmpty() ? null : errorText);
}


Expand Down Expand Up @@ -752,7 +796,7 @@ public static List<String> blockSharedSiteSessionDomains(String targetUrl, Strin

public static ThreadLocal<StringBuilder> htmlStrBuilder = new ThreadLocal<StringBuilder>(); // Every Thread has its own variable.

public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader)
public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError)
{
if ( getContentSize(conn, false) == -1 ) { // "Unacceptable size"-code..
logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
Expand All @@ -766,7 +810,14 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
htmlStrBuilder.set(htmlStrB); // Save it for future use by this thread.
}

try (BufferedReader br = (bufferedReader != null ? bufferedReader : new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb)) ) // Try-with-resources
InputStream inputStream = null;
if ( bufferedReader == null ) {
inputStream = checkEncodingAndGetInputStream(conn, isForError);
if ( inputStream == null ) // The error is already logged inside.
return null;
}

try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) ) // Try-with-resources
{
String inputLine;
while ( (inputLine = br.readLine()) != null )
Expand All @@ -788,6 +839,12 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
return null;
} finally {
htmlStrB.setLength(0); // Reset "StringBuilder" WITHOUT re-allocating.
try {
if ( inputStream != null )
inputStream.close();
} catch (IOException ioe) {
// Ignore.
}
}
}

Expand Down Expand Up @@ -878,9 +935,13 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn
return null;
}

InputStream inputStream = checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
return null;

BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb);
br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb);
String inputLine;

// Skip empty lines in the beginning of the HTML-code
Expand Down Expand Up @@ -1052,7 +1113,6 @@ public static boolean haveOnlyProtocolDifference(String url1, String url2)

public static InputStream getInputStreamFromInputDataUrl()
{
InputStream inputStream = null;
if ( (PublicationsRetriever.inputDataUrl == null) || PublicationsRetriever.inputDataUrl.isEmpty() ) {
String errorMessage = "The \"inputDataUrl\" was not given, even though";
logger.error(errorMessage);
Expand All @@ -1061,6 +1121,7 @@ public static InputStream getInputStreamFromInputDataUrl()
System.exit(55);
}

InputStream inputStream = null;
try {
HttpURLConnection conn = HttpConnUtils.handleConnection(null, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, null, true, true);
String mimeType = conn.getHeaderField("Content-Type");
Expand All @@ -1072,7 +1133,12 @@ public static InputStream getInputStreamFromInputDataUrl()
System.exit(56);
}

inputStream = new BufferedInputStream(conn.getInputStream(), FileUtils.fiveMb);
inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false);
if ( inputStream == null )
throw new RuntimeException("Could not acquire the InputStream!");

// Wrap it with a buffer, for increased efficiency.
inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb);

} catch (Exception e) {
String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,13 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty("User-Agent", userAgent);
conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
//conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");

conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
//conn.setRequestProperty("TE", "trailers"); // TODO - Investigate the "transfer-encoding" header.

if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
conn.setRequestProperty("Accept-Language", acceptLanguage);

conn.setRequestProperty("DNT", "1");
conn.setRequestProperty("Connection", "keep-alive");
conn.setRequestProperty("Sec-Fetch-Dest", "document");
Expand All @@ -339,8 +345,6 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
conn.setRequestProperty("Cache-Control", "no-cache");
conn.setRequestProperty("Host", domainStr);

if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
conn.setRequestProperty("Accept-Language", acceptLanguage);
conn.setInstanceFollowRedirects(false); // We manage redirects on our own, in order to control redirectsNum, avoid redirecting to unwantedUrls and handling errors.

boolean useHttpGetMethod = false;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package eu.openaire.publications_retriever.test;

import com.google.common.collect.HashMultimap;
import crawlercommons.filters.basic.BasicURLNormalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.file.FileUtils;
Expand Down

0 comments on commit 65af29a

Please sign in to comment.