Skip to content

Commit

Permalink
- Avoid double-logging in "PageCrawler.verifyDocLink()".
Browse files Browse the repository at this point in the history
- Code polishing.
  • Loading branch information
LSmyrnaios committed Feb 12, 2024
1 parent d8f3ce8 commit f95123d
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ public static synchronized boolean shouldRunPrediction()
/**
* This method tries to predict the docUrl of a page, if this page gives us the ID of the document, based on previous success cases.
* The idea is that we might get a url which shows info about the publication and has the same ID as the wanted docUrl, but it just happens to be in a different directory (path).
* So, before going and checking each and every one of the internal links, we should check if by using known docUrl-paths that gave docUrls before (for the current pageUrl-path), we are able to retrieve the docUrl immediately.
* So, before going and checking each one of the internal links, we should check whether, by using known docUrl-paths that gave docUrls before (for the current pageUrl-path), we are able to retrieve the docUrl immediately.
* @param urlId
* @param sourceUrl
* @param pageUrl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ public class PageCrawler
+ "|ープンアクセス方針|(?:刊行物|個人)単位登録).*"); // "Open access policy" in Japanese, "Publication unit registration"

// Example of docUrl having the "editorial" keyword: https://publikationen.ub.uni-frankfurt.de/opus4/frontdoor/deliver/index/docId/45461/file/daek_et_al_2017_editorial.pdf
// Example of docUrl having the "review" keyword: https://jag.journalagent.com/tkd/pdfs/TKDA-33903-INVITED_REVIEW-ASLAN.pdf


public static void visit(String urlId, String sourceUrl, String pageUrl, String pageContentType, HttpURLConnection conn, String firstHTMLlineFromDetectedContentType, BufferedReader bufferedReader)
{
Expand Down Expand Up @@ -602,10 +604,8 @@ public static boolean verifyDocLink(String urlId, String sourceUrl, String pageU
return false;
}
return true;
} catch (Exception e) { // After connecting to the metaDocUrl.
logger.warn("The DocLink < " + docLink + " > was not reached!");
if (e instanceof RuntimeException)
ConnSupportUtils.printEmbeddedExceptionMessage(e, docLink);
} catch (Exception e) { // After connecting to the possibleDocLink.
logger.warn("The DocLink < " + docLink + " > was not reached!"); // The specific error has already been written inside the called method.
UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + docLink + " > had connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null");
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,7 @@ public static String hasDocOrDatasetMimeType(String urlStr, String mimeType, Str
contentDisposition = contentDisposition.toLowerCase();
if ( !contentDisposition.equals("attachment") )
typeToReturn = contentDisposition.contains(".pdf") ? "document" : null; // TODO - add more types as needed. Check: "http://www.esocialsciences.org/Download/repecDownload.aspx?qs=Uqn/rN48N8UOPcbSXUd2VFI+dpOD3MDPRfIL8B3DH+6L18eo/yEvpYEkgi9upp2t8kGzrjsWQHUl44vSn/l7Uc1SILR5pVtxv8VYECXSc8pKLF6QJn6MioA5dafPj/8GshHBvLyCex2df4aviMvImCZpwMHvKoPiO+4B7yHRb97u1IHg45E+Z6ai0Z/0vacWHoCsNT9O4FNZKMsSzen2Cw=="
}
else
} else
typeToReturn = urlStr.toLowerCase().contains("pdf") ? "document" : null;
}
return typeToReturn; // It may be null.
Expand Down Expand Up @@ -1204,7 +1203,7 @@ public static long getRandomNumber(int min, int max) {
public static String getWasDirectLink(String sourceUrl, String pageUrl, boolean calledForPageUrl, String finalUrlStr) {
String wasDirectLink;
if ( calledForPageUrl ) {
boolean isSpecialUrl = HttpConnUtils.isSpecialUrl.get(); // It's more efficient to save it once in a temp-variable.
boolean isSpecialUrl = HttpConnUtils.isSpecialUrl.get();
if ( (!isSpecialUrl && ( pageUrl.equals(finalUrlStr) || ConnSupportUtils.haveOnlyProtocolDifference(pageUrl, finalUrlStr) ))
|| sourceUrl.equals(finalUrlStr) || ConnSupportUtils.haveOnlyProtocolDifference(sourceUrl, finalUrlStr)) // Or if it was not a "specialUrl" and the pageUrl is the same as the docUrl.
wasDirectLink = "true";
Expand Down

0 comments on commit f95123d

Please sign in to comment.