
Commit f2dfbba
Keep the mimetypes of first-crossed docOrDatasetUrls in memory, in order to output them for "already-retrieved" records.
LSmyrnaios committed Dec 19, 2024
1 parent 76d2889 commit f2dfbba
Showing 4 changed files with 18 additions and 13 deletions.
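The hunks below replace the old IdUrlTuple value with the IdUrlMimeTypeTriple model, which is not itself part of this diff. For orientation, here is a minimal sketch of what that class presumably looks like, with the field names and constructor-argument order taken from how the hunks use it; anything beyond that is an assumption:

	// Sketch only — inferred from the usages in this commit; the real
	// eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple may differ.
	public class IdUrlMimeTypeTriple {
		public String id;		// the ID of the record for which the docOrDatasetUrl was first crossed
		public String url;		// the sourceUrl of that first-crossed record
		public String mimeType;	// the mime type detected when the url was first retrieved

		public IdUrlMimeTypeTriple(String id, String url, String mimeType) {
			this.id = id;
			this.url = url;
			this.mimeType = mimeType;
		}
	}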
@@ -1,6 +1,7 @@
 package eu.openaire.publications_retriever.crawler;
 
 import eu.openaire.publications_retriever.exceptions.*;
+import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
 import eu.openaire.publications_retriever.util.args.ArgsUtils;
 import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
 import eu.openaire.publications_retriever.util.http.HttpConnUtils;
@@ -159,7 +160,8 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
 			} else
 				urlToCheck = currentLink;
 
-			if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(urlToCheck) ) {	// If we got into an already-found docUrl, log it and return.
+			IdUrlMimeTypeTriple idUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(urlToCheck);
+			if ( idUrlMimeTypeTriple != null ) {	// If we got into an already-found docUrl, log it and return.
 				ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, urlToCheck, false);
 				return;
 			}
@@ -591,7 +593,8 @@ public static boolean verifyDocLink(String urlId, String sourceUrl, String pageU
 			return false;
 		}
 
-		if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(docLink) ) {	// If we got into an already-found docUrl, log it and return.
+		IdUrlMimeTypeTriple idUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(docLink);
+		if ( idUrlMimeTypeTriple != null ) {	// If we got into an already-found docUrl, log it and return.
 			ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, docLink, false);
 			return true;
 		}
@@ -9,11 +9,11 @@
 import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
 import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
 import eu.openaire.publications_retriever.exceptions.FileNotRetrievedException;
+import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
 import eu.openaire.publications_retriever.models.MimeTypeResult;
 import eu.openaire.publications_retriever.util.args.ArgsUtils;
 import eu.openaire.publications_retriever.util.file.FileData;
 import eu.openaire.publications_retriever.util.file.FileUtils;
-import eu.openaire.publications_retriever.util.file.IdUrlTuple;
 import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
 import eu.openaire.publications_retriever.util.url.UrlUtils;
 import org.apache.commons.compress.compressors.brotli.BrotliCompressorInputStream;
@@ -82,9 +82,13 @@ public class ConnSupportUtils
 	private static final int timesToReturnNoTypeBeforeDomainBlocked = 10;
 	public static AtomicInteger reCrossedDocUrls = new AtomicInteger(0);
 
-	public static final String alreadyDownloadedFromIDMessage = "This file is probably already downloaded from ID=";
+	public static final String alreadyDownloadedFromIDMessage = "This file is probably already downloaded by ID=";
 	public static final String alreadyDownloadedFromSourceUrlContinuedMessage = " and SourceUrl=";
 
+	public static final String alreadyDetectedFromIDMessage = "This url was already detected by ID=";
+	public static final String alreadyDetectedFromSourceUrlContinuedMessage = " and SourceUrl=";
+
+
 	public static final Set<String> knownDocMimeTypes = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
 	public static final Set<String> knownDatasetMimeTypes = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
 
@@ -349,11 +353,9 @@ public static void handleReCrossedDocUrl(String urlId, String sourceUrl, String
 		logger.info("re-crossed docUrl found: < " + docUrl + " >");
 		reCrossedDocUrls.incrementAndGet();
 		String wasDirectLink = ConnSupportUtils.getWasDirectLink(sourceUrl, pageUrl, calledForPageUrl, docUrl);
-		if ( ArgsUtils.shouldDownloadDocFiles ) {
-			IdUrlTuple idUrlTuple = UrlUtils.docOrDatasetUrlsWithIDs.get(docUrl);
-			UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, docUrl, alreadyDownloadedFromIDMessage + idUrlTuple.id + alreadyDownloadedFromSourceUrlContinuedMessage + idUrlTuple.url, null, false, "true", "true", "true", wasDirectLink, "true", null, "null", "N/A");
-		} else
-			UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, docUrl, "", null, false, "true", "true", "true", wasDirectLink, "true", null, "null", "N/A");
+		IdUrlMimeTypeTriple idUrlMimeTypeTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(docUrl);	// It's guaranteed to not be null at this point.
+		String comment = ((ArgsUtils.shouldDownloadDocFiles ? (alreadyDownloadedFromIDMessage + idUrlMimeTypeTriple.id + alreadyDownloadedFromSourceUrlContinuedMessage) : (alreadyDetectedFromIDMessage + idUrlMimeTypeTriple.id + alreadyDetectedFromSourceUrlContinuedMessage)) + idUrlMimeTypeTriple.url);
+		UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, docUrl, comment, null, false, "true", "true", "true", wasDirectLink, "true", null, "null", idUrlMimeTypeTriple.mimeType);
 	}
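For reference, with hypothetical values (ID "id_123", SourceUrl "https://example.org/record/1"), the comment produced by the new code reads:

	shouldDownloadDocFiles == true  ->  This file is probably already downloaded by ID=id_123 and SourceUrl=https://example.org/record/1
	shouldDownloadDocFiles == false ->  This url was already detected by ID=id_123 and SourceUrl=https://example.org/record/1

and the last argument passed to addOutputData is now the stored mimeType of the first-crossed url instead of the hard-coded "N/A".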


@@ -1,8 +1,8 @@
 package eu.openaire.publications_retriever.util.url;
 
 import eu.openaire.publications_retriever.crawler.MachineLearning;
+import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
 import eu.openaire.publications_retriever.util.file.FileUtils;
-import eu.openaire.publications_retriever.util.file.IdUrlTuple;
 import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -38,7 +38,7 @@ public class UrlUtils

 	public static final Set<String> duplicateUrls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
 
-	public static final ConcurrentHashMap<String, IdUrlTuple> docOrDatasetUrlsWithIDs = new ConcurrentHashMap<String, IdUrlTuple>();	// Null keys are allowed (in case they are not available in the input).
+	public static final ConcurrentHashMap<String, IdUrlMimeTypeTriple> docOrDatasetUrlsWithIDs = new ConcurrentHashMap<>();	// Null keys are allowed (in case they are not available in the input).
 
 	public static final ConcurrentHashMap<String, Integer> domainsAndHits = new ConcurrentHashMap<>();
 	// The data inside ConcurrentHashMap "domainsAndHits" is used to evaluate how good the domain is doing while is having some problems.
@@ -84,7 +84,7 @@ public static void addOutputData(String urlId, String sourceUrl, String pageUrl,
 		finalDocOrDatasetUrl = UrlUtils.removeTemporalIdentifier(finalDocOrDatasetUrl);	// We send the non-lowerCase-url as we may want to continue with that docOrDatasetUrl in case of an error.
 
 		if ( isFirstCrossed )	// Add this id, only if this is a first-crossed docOrDatasetUrl.
-			docOrDatasetUrlsWithIDs.put(finalDocOrDatasetUrl, new IdUrlTuple(urlId, sourceUrl));	// Add it here, in order to be able to recognize it and quick-log it later, but also to distinguish it from other duplicates.
+			docOrDatasetUrlsWithIDs.put(finalDocOrDatasetUrl, new IdUrlMimeTypeTriple(urlId, sourceUrl, mimeType));	// Add it here, in order to be able to recognize it and quick-log it later, but also to distinguish it from other duplicates.
 
 		if ( pageDomain == null )
 			pageDomain = UrlUtils.getDomainStr(pageUrl, null);
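Together with the ConnSupportUtils change above, the in-memory flow is: the first time a docOrDatasetUrl is crossed, addOutputData caches its id, sourceUrl and mimeType; every later record that resolves to the same url is logged from that cache. A rough sketch of the behaviour, using hypothetical values that are not part of the commit:

	// First cross: the triple, including the mime type, is cached in memory.
	UrlUtils.docOrDatasetUrlsWithIDs.put("https://example.org/file.pdf",
			new IdUrlMimeTypeTriple("id_123", "https://example.org/record/1", "application/pdf"));

	// A later record resolves to the same docUrl: the cached triple is reused, so the
	// "already-retrieved" output row can report "application/pdf" instead of "N/A".
	IdUrlMimeTypeTriple cached = UrlUtils.docOrDatasetUrlsWithIDs.get("https://example.org/file.pdf");
	if ( cached != null )
		System.out.println(cached.id + " | " + cached.url + " | " + cached.mimeType);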
@@ -786,8 +786,8 @@ public void checkUrlConnectivity()
 		for ( String url: urlList )
 			logger.info(url);
 
-		LoaderAndChecker.retrieveDocuments = true;
 		LoaderAndChecker.retrieveDatasets = true;
+		LoaderAndChecker.retrieveDocuments = false;
 
 		// Set some needed data.
 		ConnSupportUtils.setKnownMimeTypes();