diff --git a/pom.xml b/pom.xml index 48ae523..d9cf90e 100644 --- a/pom.xml +++ b/pom.xml @@ -116,6 +116,20 @@ 32.1.3-jre + + + org.apache.commons + commons-compress + 1.25.0 + + + + + org.brotli + dec + 0.1.2 + + org.apache.commons diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java index 61145ec..38e7b13 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java @@ -88,7 +88,7 @@ public static void visit(String urlId, String sourceUrl, String pageUrl, String } String pageHtml = null; // Get the pageHtml to parse the page. - if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader)) == null ) { + if ( (pageHtml = ConnSupportUtils.getHtmlString(conn, bufferedReader, false)) == null ) { logger.warn("Could not retrieve the HTML-code for pageUrl: " + pageUrl); UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + pageContentType + "'.", null, true, "true", "true", "false", "false", "true", null, "null"); LoaderAndChecker.connProblematicUrls.incrementAndGet(); diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java index 54aec23..96a7260 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java @@ -5,6 +5,7 @@ import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.crawler.MachineLearning; import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException; +import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.url.DataToBeLogged; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.UrlUtils; @@ -349,9 +350,14 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) ) numOfDocFiles.incrementAndGet(); File docFile = docFileData.getDocFile(); + + InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false); + if ( inputStream == null ) + throw new DocFileNotRetrievedException("Could not acquire the inputStream!"); + FileOutputStream fileOutputStream = docFileData.getFileOutputStream(); - try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb); + try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb); BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) ) { int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize); @@ -442,7 +448,11 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect File docFile = new File(storeDocFilesDir + (numOfDocFile++) + ".pdf"); // First use the "numOfDocFile" and then increment it. // TODO - Later, on different fileTypes, take care of the extension properly. - try ( BufferedInputStream inStream = new BufferedInputStream(conn.getInputStream(), fiveMb); + InputStream inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false); + if ( inputStream == null ) + throw new DocFileNotRetrievedException("Could not acquire the inputStream!"); + + try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb); BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb)) { int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize); diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java index 5f43af0..d2f9e1e 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java @@ -14,6 +14,9 @@ import eu.openaire.publications_retriever.util.file.IdUrlTuple; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.UrlUtils; +import org.apache.commons.compress.compressors.brotli.BrotliCompressorInputStream; +import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.io.FileDeleteStrategy; import org.apache.commons.lang3.StringUtils; @@ -24,6 +27,7 @@ import java.io.*; import java.net.HttpURLConnection; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.time.Duration; import java.time.Instant; import java.util.*; @@ -435,7 +439,7 @@ public static String getInternalLinkFromHTTP300Page(HttpURLConnection conn) { try { String html = null; - if ( (html = ConnSupportUtils.getHtmlString(conn, null)) == null ) { + if ( (html = ConnSupportUtils.getHtmlString(conn, null, false)) == null ) { logger.warn("Could not retrieve the HTML-code for HTTP300PageUrl: " + conn.getURL().toString()); return null; } @@ -517,36 +521,76 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro } - public static String getErrorMessageFromResponseBody(HttpURLConnection conn) + public static InputStream checkEncodingAndGetInputStream(HttpURLConnection conn, boolean isForError) { - final StringBuilder msgStrB = new StringBuilder(500); - try ( InputStream inputStream = conn.getErrorStream() ) { - if ( inputStream == null ) // No error-data is provided. + InputStream inputStream = null; + try { + inputStream = (isForError ? conn.getErrorStream() : conn.getInputStream()); + if ( isForError && (inputStream == null) ) // Only the "getErrorStream" may return null. return null; + } catch (Exception e) { + logger.error("", e); + return null; + } + // Determine the potential encoding + String encoding = conn.getHeaderField("content-encoding"); + if ( encoding != null ) { + String url = conn.getURL().toString(); + if ( logger.isTraceEnabled() ) + logger.trace("Url \"" + url + "\" has content-encoding: " + encoding); + InputStream compressedInputStream = getCompressedInputStream(inputStream, encoding, url, isForError); + if ( compressedInputStream == null ) { + try { + inputStream.close(); + } catch (IOException ioe) {} + return null; // The error is logged inside the called method. + } + inputStream = compressedInputStream; + } - try ( BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)) ) { - String inputLine; - while ( (inputLine = br.readLine()) != null ) { // Returns the line, without the ending-line characters. - if ( !inputLine.isEmpty() ) - msgStrB.append(inputLine).append(" "); // We want a single finale line, not a multi-line text. - } - - if ( msgStrB.length() == 0 ) - return null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function. + return inputStream; + } - String errorText = Jsoup.parse(msgStrB.toString()).text(); // It's already "trimmed". - if ( errorText.length() == 0 ) - return null; - return errorText; - } catch (IOException ioe) { - logger.error("IOException when retrieving the response-body: " + ioe.getMessage()); + public static InputStream getCompressedInputStream(InputStream inputStream, String encoding, String url, boolean isForError) + { + InputStream compressedInputStream; + try { + if ( encoding.equals("gzip") ) + compressedInputStream = new GzipCompressorInputStream(inputStream); + else if ( encoding.equals("deflate") ) + compressedInputStream = new DeflateCompressorInputStream(inputStream); + else if ( encoding.equals("br") ) + compressedInputStream = new BrotliCompressorInputStream(inputStream); + else { + logger.warn("An unsupported \"content-encoding\" (" + encoding + ") was received from url: " + url); return null; } - } catch (Exception e) { - logger.error("Could not extract the response-body!", e); + } catch (IOException ioe) { + String exMsg = ioe.getMessage(); + if ( exMsg.startsWith("Input is not in the") ) + logger.warn(exMsg + " | http-published-encoding: " + encoding + " | url: " + url); + // Some urls do not return valid html-either way. + else + logger.error("Could not acquire the compressorInputStream for encoding: " + encoding + " | url: " + url, ioe); return null; } + return compressedInputStream; + } + + + public static String getErrorMessageFromResponseBody(HttpURLConnection conn) + { + String html = getHtmlString(conn, null, true); + if ( html == null ) + return null; + + int htmlLength = html.length(); + if ( (htmlLength == 0) || (htmlLength > 10000) ) + return null; + + String errorText = Jsoup.parse(html).text(); // The result is already "trimmed". + return ((errorText.length() > 0) ? errorText : null); } @@ -752,7 +796,7 @@ public static List blockSharedSiteSessionDomains(String targetUrl, Strin public static ThreadLocal htmlStrBuilder = new ThreadLocal(); // Every Thread has its own variable. - public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader) + public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError) { if ( getContentSize(conn, false) == -1 ) { // "Unacceptable size"-code.. logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString()); @@ -766,7 +810,14 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer htmlStrBuilder.set(htmlStrB); // Save it for future use by this thread. } - try (BufferedReader br = (bufferedReader != null ? bufferedReader : new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb)) ) // Try-with-resources + InputStream inputStream = null; + if ( bufferedReader == null ) { + inputStream = checkEncodingAndGetInputStream(conn, isForError); + if ( inputStream == null ) // The error is already logged inside. + return null; + } + + try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) ) // Try-with-resources { String inputLine; while ( (inputLine = br.readLine()) != null ) @@ -788,6 +839,12 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer return null; } finally { htmlStrB.setLength(0); // Reset "StringBuilder" WITHOUT re-allocating. + try { + if ( inputStream != null ) + inputStream.close(); + } catch (IOException ioe) { + // Ignore. + } } } @@ -878,9 +935,13 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn return null; } + InputStream inputStream = checkEncodingAndGetInputStream(conn, false); + if ( inputStream == null ) + return null; + BufferedReader br = null; try { - br = new BufferedReader(new InputStreamReader(conn.getInputStream()), FileUtils.fiveMb); + br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb); String inputLine; // Skip empty lines in the beginning of the HTML-code @@ -1052,7 +1113,6 @@ public static boolean haveOnlyProtocolDifference(String url1, String url2) public static InputStream getInputStreamFromInputDataUrl() { - InputStream inputStream = null; if ( (PublicationsRetriever.inputDataUrl == null) || PublicationsRetriever.inputDataUrl.isEmpty() ) { String errorMessage = "The \"inputDataUrl\" was not given, even though"; logger.error(errorMessage); @@ -1061,6 +1121,7 @@ public static InputStream getInputStreamFromInputDataUrl() System.exit(55); } + InputStream inputStream = null; try { HttpURLConnection conn = HttpConnUtils.handleConnection(null, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, PublicationsRetriever.inputDataUrl, null, true, true); String mimeType = conn.getHeaderField("Content-Type"); @@ -1072,7 +1133,12 @@ public static InputStream getInputStreamFromInputDataUrl() System.exit(56); } - inputStream = new BufferedInputStream(conn.getInputStream(), FileUtils.fiveMb); + inputStream = ConnSupportUtils.checkEncodingAndGetInputStream(conn, false); + if ( inputStream == null ) + throw new RuntimeException("Could not acquire the InputStream!"); + + // Wrap it with a buffer, for increased efficiency. + inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb); } catch (Exception e) { String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage(); diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java index 0765236..81f3a3c 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java @@ -328,7 +328,13 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do conn = (HttpURLConnection) url.openConnection(); conn.setRequestProperty("User-Agent", userAgent); conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); - //conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br"); + + conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br"); + //conn.setRequestProperty("TE", "trailers"); // TODO - Investigate the "transfer-encoding" header. + + if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) ) + conn.setRequestProperty("Accept-Language", acceptLanguage); + conn.setRequestProperty("DNT", "1"); conn.setRequestProperty("Connection", "keep-alive"); conn.setRequestProperty("Sec-Fetch-Dest", "document"); @@ -339,8 +345,6 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do conn.setRequestProperty("Cache-Control", "no-cache"); conn.setRequestProperty("Host", domainStr); - if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) ) - conn.setRequestProperty("Accept-Language", acceptLanguage); conn.setInstanceFollowRedirects(false); // We manage redirects on our own, in order to control redirectsNum, avoid redirecting to unwantedUrls and handling errors. boolean useHttpGetMethod = false; diff --git a/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java b/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java index 065d765..756fa72 100644 --- a/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java +++ b/src/test/java/eu/openaire/publications_retriever/test/UrlChecker.java @@ -1,7 +1,6 @@ package eu.openaire.publications_retriever.test; import com.google.common.collect.HashMultimap; -import crawlercommons.filters.basic.BasicURLNormalizer; import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.crawler.PageCrawler; import eu.openaire.publications_retriever.util.file.FileUtils;