diff --git a/pom.xml b/pom.xml index 5e70345..d0c87d0 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.1.2 + 3.2.1 @@ -127,7 +127,7 @@ commons-io commons-io - 2.14.0 + 2.15.0 diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java index ca22f19..e8f7e03 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java @@ -80,7 +80,7 @@ public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl, // Block the domain and return "true" to indicate handled-state. HttpConnUtils.blacklistedDomains.add(pageDomain); logger.warn("Domain: \"" + pageDomain + "\" was blocked, after giving a dynamic metaDocUrl: " + metaDocUrl); - UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", null, true, "true", "true", "false", "false", "false", null, "null"); // We log the source-url, and that was discarded in "PageCrawler.visit()". + UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", null, true, "true", "true", "false", "false", "false", null, "null"); PageCrawler.contentProblematicUrls.incrementAndGet(); return true; // Since the domain is blocked, there is no point in continuing to crawl. } @@ -165,7 +165,6 @@ public static String getMetaDocUrlFromHTML(String pageHtml) return null; // It was not found and so it was not handled. We don't log the sourceUrl, since it will be handled later. //logger.debug("Matched meta-doc-url-line: " + metaDocUrlMatcher.group(0)); // DEBUG!! - String metaDocUrl = null; try { metaDocUrl = metaDocUrlMatcher.group(1); diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java index d56bfc5..6b8b5ba 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java @@ -770,8 +770,8 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer String inputLine; while ( (inputLine = br.readLine()) != null ) { - if ( !inputLine.isEmpty() && (inputLine.length() != 1) && !SPACE_ONLY_LINE.matcher(inputLine).matches() ) { // We check for (inputLine.length() != 1), as some lines contain an unrecognized byte. - htmlStrB.append(inputLine); + if ( !inputLine.isEmpty() && (inputLine.length() != 1) && !SPACE_ONLY_LINE.matcher(inputLine).matches() ) { // We check for (inputLine.length() != 1), as some lines contain just an unrecognized byte. + htmlStrB.append(inputLine).append(" "); // Add a space where the line-break was, in order to not join words in text, which are separated by a new line in the html-code. //logger.debug(inputLine); // DEBUG! } } @@ -828,10 +828,13 @@ public static ArrayList detectContentTypeFromResponseBody(String finalUr mimeType = "application/pdf"; calledForPossibleDocUrl = true; // Important for the re-connection. foundDetectedContentType = true; + // The "bufferedReader" has already been closed. } else if ( detectedContentType.detectedContentType.equals("undefined") ) logger.debug("The url with the undeclared content type < " + finalUrlStr + " >, was examined and found to have UNDEFINED contentType."); + // The "bufferedReader" has already been closed. else warnMsg += "\nUnspecified \"detectedContentType\": " + detectedContentType.detectedContentType; + // Normally, we should never reach here. The BufferedReader should be null. } else // ( detectedContentType == null ) warnMsg += "\nCould not retrieve the response-body for url: " + finalUrlStr; @@ -840,6 +843,7 @@ public static ArrayList detectContentTypeFromResponseBody(String finalUr warnMsg += "\nThe initial connection was made with the \"HTTP-HEAD\" method, so there is no response-body to use to detect the content-type."; if ( !foundDetectedContentType && wasConnectedWithHTTPGET ) { // If it could be detected (by using the "GET" method to take the response-body), but it was not, only then go and check if it should be blocked. + // The BufferedReader should be null here. if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, HttpConnUtils.timesDomainsReturnedNoType, domainStr, timesToReturnNoTypeBeforeDomainBlocked, true) ) { logger.warn(warnMsg); logger.warn("Domain: \"" + domainStr + "\" was blocked after returning no Type-info more than " + timesToReturnNoTypeBeforeDomainBlocked + " times."); @@ -888,10 +892,12 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn logger.debug(Arrays.toString(inputLine.chars().toArray())); }*/ - //logger.debug("First line of RequestBody: " + inputLine); // DEBUG! + // Check if the stream ended before we could find an "accepted" line. if ( inputLine == null ) return null; + //logger.debug("First actual line of RequestBody: " + inputLine); // DEBUG! + String lowerCaseInputLine = inputLine.toLowerCase(); //logger.debug(lowerCaseInputLine + "\nLength of line: " + lowerCaseInputLine.length()); // DEBUG! if ( HTML_STRING_MATCH.matcher(lowerCaseInputLine).matches() ) @@ -989,12 +995,8 @@ public static String getFullyFormedUrl(String pageUrl, String currentLink, URL u public static boolean isJustAnHTTPSredirect(String currentUrl, String targetUrl) { - // First check if we go from an http to an https in general. - if ( !currentUrl.startsWith("http://", 0) && !targetUrl.startsWith("https://", 0) ) - return false; - - // Take the url after the protocol and check if it's the same, if it is then we have our HTTPS redirect, if not then it's another type of redirect. - return haveOnlyProtocolDifference(currentUrl, targetUrl); + return ( currentUrl.startsWith("http://", 0) && targetUrl.startsWith("https://", 0) + && haveOnlyProtocolDifference(currentUrl, targetUrl) ); } diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java index f89a738..dc9fd73 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java @@ -34,7 +34,7 @@ public class UrlTypeChecker + "|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$" // Url ends with these. Note that some of them are likely to be part of a docUrl, for ex. the "/trends/"-dir. // TODO - In case we have just DocUrls (not datasetUrls), exclude the following as well: "/(?:bibtext|dc(?:terms)?|tei|endnote)$", it could be added in another regex.. or do an initialization check and construct this regex based on the url-option provided. + "|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)" - + "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|cookie|notfound|(?:404|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern + ").*"); + + "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|cookie|(?:page-)?not[-]?found|(?:404(?:_response)|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern + ").*"); // We check them as a directory to avoid discarding publications' urls about these subjects. There's "acesso" (single "c") in Portuguese.. Also there's "autore" & "contatto" in Italian.