Skip to content

Commit

Permalink
- Fix concatenating some words when acquiring the HTML-code.
Browse files Browse the repository at this point in the history
- Avoid some problematic urls.
- Update dependencies.
- Code polishing.
  • Loading branch information
LSmyrnaios committed Nov 2, 2023
1 parent f8ace6b commit 4edfd67
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 15 deletions.
6 changes: 3 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.1.2</version>
<version>3.2.1</version>
<configuration>
<!--<excludes>
<exclude>some test to exclude here</exclude>
Expand Down Expand Up @@ -106,7 +106,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.16.1</version>
<version>1.16.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
Expand All @@ -127,7 +127,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.14.0</version>
<version>2.15.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.json/json -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl,
// Block the domain and return "true" to indicate handled-state.
HttpConnUtils.blacklistedDomains.add(pageDomain);
logger.warn("Domain: \"" + pageDomain + "\" was blocked, after giving a dynamic metaDocUrl: " + metaDocUrl);
UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", null, true, "true", "true", "false", "false", "false", null, "null"); // We log the source-url, and that was discarded in "PageCrawler.visit()".
UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", null, true, "true", "true", "false", "false", "false", null, "null");
PageCrawler.contentProblematicUrls.incrementAndGet();
return true; // Since the domain is blocked, there is no point in continuing to crawl.
}
Expand Down Expand Up @@ -165,7 +165,6 @@ public static String getMetaDocUrlFromHTML(String pageHtml)
return null; // It was not found and so it was not handled. We don't log the sourceUrl, since it will be handled later.

//logger.debug("Matched meta-doc-url-line: " + metaDocUrlMatcher.group(0)); // DEBUG!!

String metaDocUrl = null;
try {
metaDocUrl = metaDocUrlMatcher.group(1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -770,8 +770,8 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
String inputLine;
while ( (inputLine = br.readLine()) != null )
{
if ( !inputLine.isEmpty() && (inputLine.length() != 1) && !SPACE_ONLY_LINE.matcher(inputLine).matches() ) { // We check for (inputLine.length() != 1), as some lines contain an unrecognized byte.
htmlStrB.append(inputLine);
if ( !inputLine.isEmpty() && (inputLine.length() != 1) && !SPACE_ONLY_LINE.matcher(inputLine).matches() ) { // We check for (inputLine.length() != 1), as some lines contain just an unrecognized byte.
htmlStrB.append(inputLine).append(" "); // Add a space where the line-break was, in order to not join words in text, which are separated by a new line in the html-code.
//logger.debug(inputLine); // DEBUG!
}
}
Expand Down Expand Up @@ -828,10 +828,13 @@ public static ArrayList<Object> detectContentTypeFromResponseBody(String finalUr
mimeType = "application/pdf";
calledForPossibleDocUrl = true; // Important for the re-connection.
foundDetectedContentType = true;
// The "bufferedReader" has already been closed.
} else if ( detectedContentType.detectedContentType.equals("undefined") )
logger.debug("The url with the undeclared content type < " + finalUrlStr + " >, was examined and found to have UNDEFINED contentType.");
// The "bufferedReader" has already been closed.
else
warnMsg += "\nUnspecified \"detectedContentType\": " + detectedContentType.detectedContentType;
// Normally, we should never reach here. The BufferedReader should be null.
}
else // ( detectedContentType == null )
warnMsg += "\nCould not retrieve the response-body for url: " + finalUrlStr;
Expand All @@ -840,6 +843,7 @@ public static ArrayList<Object> detectContentTypeFromResponseBody(String finalUr
warnMsg += "\nThe initial connection was made with the \"HTTP-HEAD\" method, so there is no response-body to use to detect the content-type.";

if ( !foundDetectedContentType && wasConnectedWithHTTPGET ) { // If it could be detected (by using the "GET" method to take the response-body), but it was not, only then go and check if it should be blocked.
// The BufferedReader should be null here.
if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, HttpConnUtils.timesDomainsReturnedNoType, domainStr, timesToReturnNoTypeBeforeDomainBlocked, true) ) {
logger.warn(warnMsg);
logger.warn("Domain: \"" + domainStr + "\" was blocked after returning no Type-info more than " + timesToReturnNoTypeBeforeDomainBlocked + " times.");
Expand Down Expand Up @@ -888,10 +892,12 @@ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConn
logger.debug(Arrays.toString(inputLine.chars().toArray()));
}*/

//logger.debug("First line of RequestBody: " + inputLine); // DEBUG!
// Check if the stream ended before we could find an "accepted" line.
if ( inputLine == null )
return null;

//logger.debug("First actual line of RequestBody: " + inputLine); // DEBUG!

String lowerCaseInputLine = inputLine.toLowerCase();
//logger.debug(lowerCaseInputLine + "\nLength of line: " + lowerCaseInputLine.length()); // DEBUG!
if ( HTML_STRING_MATCH.matcher(lowerCaseInputLine).matches() )
Expand Down Expand Up @@ -989,12 +995,8 @@ public static String getFullyFormedUrl(String pageUrl, String currentLink, URL u

/**
 * Tells whether going from {@code currentUrl} to {@code targetUrl} is nothing more than an upgrade from HTTP to HTTPS.
 *
 * All three conditions must hold: the current url starts with "http://", the target url starts with "https://"
 * and {@code haveOnlyProtocolDifference()} accepts the pair (it is expected to compare the part of the urls after the protocol).
 * The earlier guard "!startsWith(http) && !startsWith(https)" rejected the pair only when BOTH prefix-checks failed,
 * so pairs like "http -> http" or "https -> https" were wrongly passed on to the protocol-difference check.
 *
 * @param currentUrl the url before the redirect.
 * @param targetUrl the url the redirect points to.
 * @return {@code true} only if the redirect is an "http://" to "https://" switch and the urls have only a protocol difference, otherwise {@code false}.
 */
public static boolean isJustAnHTTPSredirect(String currentUrl, String targetUrl)
{
	// The cheap prefix-checks come first, so the comparison of the remaining parts runs only for a real "http -> https" candidate.
	return ( currentUrl.startsWith("http://", 0) && targetUrl.startsWith("https://", 0)
			&& haveOnlyProtocolDifference(currentUrl, targetUrl) );
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public class UrlTypeChecker
+ "|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$" // Url ends with these. Note that some of them are likely to be part of a docUrl, for ex. the "/trends/"-dir.
// TODO - In case we have just DocUrls (not datasetUrls), exclude the following as well: "/(?:bibtext|dc(?:terms)?|tei|endnote)$", it could be added in another regex.. or do an initialization check and construct this regex based on the url-option provided.
+ "|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)"
+ "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|cookie|notfound|(?:404|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern + ").*");
+ "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|cookie|(?:page-)?not[-]?found|(?:404(?:_response)|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern + ").*");

// We check them as a directory to avoid discarding publications' urls about these subjects. There's "acesso" (single "c") in Portuguese. Also, there's "autore" & "contatto" in Italian.

Expand Down

0 comments on commit 4edfd67

Please sign in to comment.