Skip to content

Commit

Permalink
Optimize "ConnSupportUtils.handleReCrossedDocUrl()".
Browse files Browse the repository at this point in the history
  • Loading branch information
LSmyrnaios committed Jan 8, 2025
1 parent 43afd67 commit 6d03bbe
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
Expand Down Expand Up @@ -316,9 +317,10 @@ else if ( UrlTypeChecker.PLAIN_PAGE_EXTENSION_FILTER.matcher(docIdStr.toLowerCas
logger.debug("Found a \"predictedDocUrl\" which exists in the \"currentPageLinks\": " + predictedDocUrl); // DEBUG!

// Check if the "predictedDocUrl" has been found before, but only if it exists in the set of this page's internal-links, as otherwise we may end up with a "docUrl" which is not related to this pageUrl.
if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(predictedDocUrl) ) { // If we got into an already-found docUrl, log it and return true.
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(predictedDocUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return true.
logger.info("MachineLearningAlgorithm got a hit for pageUrl: \""+ pageUrl + "\"! Resulted (already found before) docUrl was: \"" + predictedDocUrl + "\"" ); // DEBUG!
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, predictedDocUrl, false);
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, predictedDocUrl, originalIdUrlMimeTypeTriple, false);
MachineLearning.docUrlsFoundByMLA.incrementAndGet();
return true;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
Expand Down Expand Up @@ -166,8 +167,9 @@ public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, Str
// For example: http://localhost:4000/bitstreams/98e649e7-a656-4a90-ad69-534178e63fbb/download
metaDocUrl = LOCALHOST_DOMAIN_REPLACEMENT_PATTERN.matcher(metaDocUrl).replaceFirst("://" + pageDomain);

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(metaDocUrl) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, metaDocUrl, false);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(metaDocUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, metaDocUrl, originalIdUrlMimeTypeTriple, false);
numOfMetaDocUrlsFound.incrementAndGet();
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,10 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
} else
urlToCheck = currentLink;

IdUrlMimeTypeTriple idUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(urlToCheck); // If we got into an already-found docUrl, log it and return.
if ( idUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, urlToCheck, false);
return;
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(urlToCheck); // If we got into an already-found docUrl, log it and return.
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, urlToCheck, originalIdUrlMimeTypeTriple, false);
return;
}

lowerCaseLink = urlToCheck.toLowerCase();
Expand Down Expand Up @@ -593,9 +593,9 @@ public static boolean verifyDocLink(String urlId, String sourceUrl, String pageU
return false;
}

IdUrlMimeTypeTriple idUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(docLink);
if ( idUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, docLink, false);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(docLink);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, docLink, originalIdUrlMimeTypeTriple, false);
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkUnavailableException;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
Expand Down Expand Up @@ -224,8 +225,9 @@ public static boolean extractAndCheckTurkjgastroenterolDocUrl(String pageHtml, S
if ( (urlToCheck = LoaderAndChecker.handleUrlChecks(urlId, urlToCheck)) == null )
return false; // The output-data was logged inside.

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(urlToCheck) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, urlToCheck, urlToCheck, urlToCheck, true); // The output-data was logged inside.
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(urlToCheck);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, urlToCheck, urlToCheck, urlToCheck, originalIdUrlMimeTypeTriple, true); // The output-data was logged inside.
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,13 +349,12 @@ else if ( (contentDisposition != null) && !contentDisposition.equals("attachment
}


/**
 * Handles a docUrl which has already been found before (a "re-crossed" docUrl).
 * It logs the event, increments the "reCrossedDocUrls" counter and writes an output-record through "UrlUtils.addOutputData()".
 * The comment of that record points to the id and the url of the triple which was stored when the docUrl was first found,
 * and the record carries the mimeType of that triple.
 *
 * NOTE(review): this span looks like a rendered diff, so the removed and the added lines appear one after the other
 * (two signatures and two "comment" / "addOutputData" pairs). Confirm against the actual file before relying on it.
 *
 * @param urlId the id of the record which led to the docUrl; passed on to "UrlUtils.addOutputData()".
 * @param sourceUrl the source-url of the record; used for the "wasDirectLink" computation and for the output-record.
 * @param pageUrl the page-url of the record; used for the "wasDirectLink" computation and for the output-record.
 * @param docUrl the re-crossed docUrl.
 * @param originalIdUrlMimeTypeTriple (added version only) the triple stored for the docUrl; its fields are read without a null-check,
 *                                    so it must not be null. The visible callers get it from "UrlUtils.docOrDatasetUrlsWithIDs" and null-check it first.
 * @param calledForPageUrl passed on to "ConnSupportUtils.getWasDirectLink()".
 */
public static void handleReCrossedDocUrl(String urlId, String sourceUrl, String pageUrl, String docUrl, boolean calledForPageUrl) {
public static void handleReCrossedDocUrl(String urlId, String sourceUrl, String pageUrl, String docUrl, IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple, boolean calledForPageUrl) {
logger.info("re-crossed docUrl found: < " + docUrl + " >");
reCrossedDocUrls.incrementAndGet();
String wasDirectLink = ConnSupportUtils.getWasDirectLink(sourceUrl, pageUrl, calledForPageUrl, docUrl);
// Removed version (presumably): the triple was looked up here again, although the callers had already queried the same map.
IdUrlMimeTypeTriple idUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(docUrl); // It's guaranteed to not be null at this point.
String comment = ((ArgsUtils.shouldDownloadDocFiles ? (alreadyDownloadedFromIDMessage + idUrlMimeTypeTriple.id + alreadyDownloadedFromSourceUrlContinuedMessage) : (alreadyDetectedFromIDMessage + idUrlMimeTypeTriple.id + alreadyDetectedFromSourceUrlContinuedMessage)) + idUrlMimeTypeTriple.url);
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, docUrl, comment, null, false, "true", "true", "true", wasDirectLink, "true", null, "null", idUrlMimeTypeTriple.mimeType);
// Added version (presumably): the triple given by the caller is used directly.
// The message-prefix depends on whether the doc-files are downloaded ("already downloaded") or just detected ("already detected").
String comment = ((ArgsUtils.shouldDownloadDocFiles ? (alreadyDownloadedFromIDMessage + originalIdUrlMimeTypeTriple.id + alreadyDownloadedFromSourceUrlContinuedMessage) : (alreadyDetectedFromIDMessage + originalIdUrlMimeTypeTriple.id + alreadyDetectedFromSourceUrlContinuedMessage)) + originalIdUrlMimeTypeTriple.url);
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, docUrl, comment, null, false, "true", "true", "true", wasDirectLink, "true", null, "null", originalIdUrlMimeTypeTriple.mimeType);
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.crawler.SpecialUrlsHandler;
import eu.openaire.publications_retriever.exceptions.*;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.models.MimeTypeResult;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileData;
Expand Down Expand Up @@ -554,8 +555,9 @@ else if ( lowerCaseTargetUrl.contains("sharedsitesession") ) { // either "getSha

//ConnSupportUtils.printRedirectDebugInfo(currentUrl, location, targetUrl, responseCode, curRedirectsNum);

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(targetUrl) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, targetUrl, calledForPageUrl);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(targetUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, targetUrl, originalIdUrlMimeTypeTriple, calledForPageUrl);
throw new AlreadyFoundDocUrlException();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import eu.openaire.publications_retriever.exceptions.ConnTimeoutException;
import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.exceptions.DomainWithUnsupportedHEADmethodException;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
Expand Down Expand Up @@ -129,8 +130,9 @@ public static void loadAndCheckUrls() throws RuntimeException
return false;
}

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(retrievedUrl) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl("null", retrievedUrl, retrievedUrl, retrievedUrl, true);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(retrievedUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl("null", retrievedUrl, retrievedUrl, retrievedUrl, originalIdUrlMimeTypeTriple, true);
return true;
}

Expand Down Expand Up @@ -221,8 +223,9 @@ public static void loadAndCheckIdUrlPairs() throws RuntimeException
continue;
} // The "retrievedUrl" might have changed (inside "handleUrlChecks()").

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(retrievedUrl) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(retrievedId, retrievedUrl, retrievedUrl, retrievedUrl, true);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(retrievedUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(retrievedId, retrievedUrl, retrievedUrl, retrievedUrl, originalIdUrlMimeTypeTriple, true);
if ( !isSingleIdUrlPair )
loggedUrlsOfCurrentId.add(retrievedUrl);
goToNextId = true; // Skip the best-url evaluation & connection after this loop.
Expand Down Expand Up @@ -373,8 +376,9 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException
return false;
}

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(retrievedUrl) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(retrievedId, retrievedUrl, retrievedUrl, retrievedUrl, true);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(retrievedUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(retrievedId, retrievedUrl, retrievedUrl, retrievedUrl, originalIdUrlMimeTypeTriple, true);
return true;
}

Expand Down Expand Up @@ -460,8 +464,9 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException
continue;
}

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(retrievedUrl) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(retrievedId, retrievedUrl, retrievedUrl, retrievedUrl, true);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(retrievedUrl);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(retrievedId, retrievedUrl, retrievedUrl, retrievedUrl, originalIdUrlMimeTypeTriple, true);
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.google.common.collect.HashMultimap;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
Expand Down Expand Up @@ -826,8 +827,9 @@ public void checkUrlConnectivity()
else
logger.debug("urlPath: " + urlPath);*/

if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(urlToCheck) ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(testID, urlToCheck, urlToCheck, urlToCheck, true);
IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(urlToCheck);
if ( originalIdUrlMimeTypeTriple != null ) { // If we got into an already-found docUrl, log it and return.
ConnSupportUtils.handleReCrossedDocUrl(testID, urlToCheck, urlToCheck, urlToCheck, originalIdUrlMimeTypeTriple, true);
continue;
}

Expand Down

0 comments on commit 6d03bbe

Please sign in to comment.