Skip to content

Commit

Permalink
- Fix duplicate output when a record was discarded during redirections, after matching to regex-rules.
Browse files Browse the repository at this point in the history

- Fix an issue where, in some error-cases, no value was assigned to the "pageUrl" field, thus resulting in some of the results missing this field altogether.
  • Loading branch information
LSmyrnaios committed Dec 15, 2024
1 parent 699c0c0 commit 76d2889
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ public static HttpURLConnection handleRedirects(String urlId, String sourceUrl,
String lowerCaseTargetUrl = targetUrl.toLowerCase();
if ( (calledForPageUrl && UrlTypeChecker.shouldNotAcceptPageUrl(urlId, sourceUrl, targetUrl, lowerCaseTargetUrl, calledForPageUrl)) // Redirecting a pageUrl.
|| (!calledForPageUrl && UrlTypeChecker.shouldNotAcceptInternalLink(targetUrl, lowerCaseTargetUrl)) ) // Redirecting an internalPageLink.
throw new RuntimeException("Url: \"" + initialUrl + "\" was prevented to redirect to the unwanted location: \"" + targetUrl + "\", after receiving an \"HTTP " + responseCode + "\" Redirect Code, in redirection-number: " + curRedirectsNum);
throw new RuntimeException("Url: \"" + initialUrl + "\" was prevented to redirect to the unwanted location: \"" + targetUrl + "\", after receiving an \"HTTP " + responseCode + "\" Redirect Code, in redirection-number: " + curRedirectsNum + (calledForPageUrl ? " | " + LoaderAndChecker.alreadyLoggedMessage : ""));
else if ( lowerCaseTargetUrl.contains("sharedsitesession") ) { // either "getSharedSiteSession" or "consumeSharedSiteSession".
logger.warn("Initial-url: \"" + initialUrl + "\" tried to cause a \"sharedSiteSession-redirectionPack\" by redirecting to \"" + targetUrl + "\"!");
List<String> blockedDomains = ConnSupportUtils.blockSharedSiteSessionDomains(targetUrl, currentUrl);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ public class LoaderAndChecker
+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile)";
public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)" + dataset_formats + "(?:\\?.+)?$)");

public static final String alreadyLoggedMessage = "__LOGGED__";

public static final BasicURLNormalizer basicURLNormalizer = BasicURLNormalizer.newBuilder().build();

Expand Down Expand Up @@ -123,7 +124,7 @@ public static void loadAndCheckUrls() throws RuntimeException
String urlToCheck = retrievedUrlToCheck;
if ( (urlToCheck = basicURLNormalizer.filter(retrievedUrlToCheck)) == null ) {
logger.warn("Could not normalize url: " + retrievedUrlToCheck);
UrlUtils.addOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData("null", retrievedUrlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
Expand All @@ -142,11 +143,18 @@ public static void loadAndCheckUrls() throws RuntimeException
try { // We sent the < null > into quotes to avoid causing NPEs in the thread-safe datastructures that do not support null input.
HttpConnUtils.connectAndCheckMimeType("null", retrievedUrlToCheck, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData("null", retrievedUrlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
return false;
}
return true;
Expand Down Expand Up @@ -266,7 +274,7 @@ else if ( neutralUrl != null )
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.addOutputData(retrievedId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, sourceUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();

// If other urls exits, then go and check those.
Expand All @@ -285,11 +293,18 @@ else if ( neutralUrl != null )
loggedUrlsOfCurrentId.add(urlToCheck);
// Here the runnable was successful in any case.
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
// This url had connectivity problems.. but the rest might not, go check them out.
if ( !isSingleIdUrlPair ) {
loggedUrlsOfCurrentId.add(urlToCheck);
Expand Down Expand Up @@ -346,14 +361,14 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException
String retrievedUrl = pair.getValue();

if ( (retrievedUrl = handleUrlChecks(retrievedId, retrievedUrl)) == null ) {
return false;
return false; // The logging happens inside.
} // The "retrievedUrl" might have changed (inside "handleUrlChecks()").

String urlToCheck = retrievedUrl;
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.addOutputData(retrievedId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, sourceUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
Expand All @@ -375,11 +390,18 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException
try { // Check if it's a docUrl, if not, it gets crawled.
HttpConnUtils.connectAndCheckMimeType(retrievedId, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
return false;
}
return true;
Expand Down Expand Up @@ -433,7 +455,7 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.addOutputData(retrievedId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, sourceUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
continue;
}
Expand All @@ -455,11 +477,18 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException
try { // Check if it's a docUrl, if not, it gets crawled.
HttpConnUtils.connectAndCheckMimeType(retrievedId, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
return false;
}
}
Expand Down Expand Up @@ -549,11 +578,18 @@ private static boolean checkRemainingUrls(String retrievedId, Set<String> retrie
loggedUrlsOfThisId.add(urlToCheck);
return true; // A url was checked and didn't have any problems, return and log the remaining urls.
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, in checkRemainingUrls(), as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
if ( !isSingleIdUrlPair )
loggedUrlsOfThisId.add(urlToCheck);
// Try the next url..
Expand All @@ -575,23 +611,23 @@ public static String handleUrlChecks(String urlId, String retrievedUrl)
String urlDomain = UrlUtils.getDomainStr(retrievedUrl, null);
if ( urlDomain == null ) { // If the domain is not found, it means that a serious problem exists with this docPage, and we shouldn't crawl it.
// The reason is already logged.
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, after the occurrence of a domain-retrieval error.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, after the occurrence of a domain-retrieval error.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
if ( !useIdUrlPairs )
connProblematicUrls.incrementAndGet();
return null;
}

if ( HttpConnUtils.blacklistedDomains.contains(urlDomain) ) { // Check if it has been blacklisted after running internal links' checks.
logger.debug("Avoid connecting to blacklisted domain: \"" + urlDomain + "\" with url: " + retrievedUrl);
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, as its domain was found blacklisted.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, as its domain was found blacklisted.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
if ( !useIdUrlPairs )
connProblematicUrls.incrementAndGet();
return null;
}

if ( ConnSupportUtils.checkIfPathIs403BlackListed(retrievedUrl, urlDomain) ) { // The path-extraction is independent of the jsessionid-removal, so this gets executed before.
logger.debug("Preventing reaching 403ErrorCode with url: \"" + retrievedUrl + "\"!");
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' as it had a blacklisted urlPath.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' as it had a blacklisted urlPath.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
if ( !useIdUrlPairs )
connProblematicUrls.incrementAndGet();
return null;
Expand All @@ -609,7 +645,7 @@ public static String handleUrlChecks(String urlId, String retrievedUrl)
// Check if it's a duplicate.
if ( UrlUtils.duplicateUrls.contains(retrievedUrl) ) {
logger.debug("Skipping non-DocOrDataset-url: \"" + retrievedUrl + "\", at loading, as it has already been checked.");
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.duplicateUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()', as it's a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.duplicateUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()', as it's a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null", "N/A");
if ( !useIdUrlPairs )
inputDuplicatesNum.incrementAndGet();
return null;
Expand Down Expand Up @@ -673,7 +709,7 @@ private static void handleLogOfRemainingUrls(String retrievedId, Set<String> ret
retrievedUrl = tempUrl; // Make sure we check the non-normalized version.

if ( !loggedUrlsOfThisId.contains(retrievedUrl) )
UrlUtils.addOutputData(retrievedId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator,
UrlUtils.addOutputData(retrievedId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator,
"Skipped in LoaderAndChecker, as a better url was selected for id: " + retrievedId, null, true, "false", "N/A", "N/A", "N/A", "true", null, "null", "N/A");
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,20 @@ public void testCustomInputOutputWithDatasets()
}


@Disabled
@Test
public void testCustomInputOutputWithDocsAndDatasets()
{
String[] args = new String[4];
args[0] = "-retrieveDataType";
args[1] = "all"; // "document" OR "dataset" OR "all"
args[2] = "-inputFileFullPath";
//args[3] = "./testData/idUrlPairs/test.json";
args[3] = "./testData/idUrlPairs/ukrn_data_test.json";
//args[3] = "./testData/idUrlPairs/docsAndDatasets.json";
main(args);
}


@Disabled
@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -835,7 +835,7 @@ public void checkUrlConnectivity()
HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
} catch (Exception e) {
// The problem was logged inside.
UrlUtils.addOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(testID, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
}
}

Expand Down

0 comments on commit 76d2889

Please sign in to comment.