Skip to content

Commit

Permalink
Improve naming for "metadata" handler.
Browse files Browse the repository at this point in the history
  • Loading branch information
LSmyrnaios committed Feb 12, 2024
1 parent f95123d commit fd3c79b
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package eu.openaire.publications_retriever;

import eu.openaire.publications_retriever.crawler.MachineLearning;
import eu.openaire.publications_retriever.crawler.MetaDocUrlsHandler;
import eu.openaire.publications_retriever.crawler.MetadataHandler;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
Expand Down Expand Up @@ -379,15 +379,15 @@ public static void showStatistics(Instant startTime)
logger.info("From which docUrls, we were able to retrieve: " + numOfStoredDocFiles + " distinct docFiles. That's about: " + df.format(numOfStoredDocFiles * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%."
+ " The un-retrieved docFiles were either belonging to already-found docUrls or they had connection-issues.");
}
logger.debug("The metaDocUrl-handler is responsible for the discovery of " + MetaDocUrlsHandler.numOfMetaDocUrlsFound + " docUrls (" + df.format(MetaDocUrlsHandler.numOfMetaDocUrlsFound.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls).");
logger.debug("The metaDocUrl-handler is responsible for the discovery of " + MetadataHandler.numOfMetaDocUrlsFound + " docUrls (" + df.format(MetadataHandler.numOfMetaDocUrlsFound.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls).");
logger.debug("The re-crossed docUrls (from all handlers) were " + ConnSupportUtils.reCrossedDocUrls.get() + ". That's about " + df.format(ConnSupportUtils.reCrossedDocUrls.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls.");
if ( MachineLearning.useMLA )
logger.debug("The M.L.A. is responsible for the discovery of " + MachineLearning.docUrlsFoundByMLA.get() + " of the docUrls (" + df.format(MachineLearning.docUrlsFoundByMLA.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%). The M.L.A.'s average success-rate was: " + df.format(MachineLearning.getAverageSuccessRate()) + "%. Gathered data for " + MachineLearning.timesGatheredData.get() + " valid pageUrl-docUrl pairs.");
else
logger.debug("The M.L.A. was not enabled.");

logger.debug("About " + df.format(LoaderAndChecker.connProblematicUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + LoaderAndChecker.connProblematicUrls.get() + " urls) were pages which had connectivity problems.");
logger.debug("About " + df.format(MetaDocUrlsHandler.numOfProhibitedAccessPagesFound.get() * 100.0 / inputCheckedUrlNum) + "% (" + MetaDocUrlsHandler.numOfProhibitedAccessPagesFound.get() + " urls) were pages with prohibited access.");
logger.debug("About " + df.format(MetadataHandler.numOfProhibitedAccessPagesFound.get() * 100.0 / inputCheckedUrlNum) + "% (" + MetadataHandler.numOfProhibitedAccessPagesFound.get() + " urls) were pages with prohibited access.");
logger.debug("About " + df.format(UrlTypeChecker.pagesNotProvidingDocUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + UrlTypeChecker.pagesNotProvidingDocUrls.get() + " urls) were pages which did not provide docUrls.");
logger.debug("About " + df.format(UrlTypeChecker.longToRespondUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + UrlTypeChecker.longToRespondUrls.get() + " urls) were urls which belong to domains which take too long to respond.");
logger.debug("About " + df.format(PageCrawler.contentProblematicUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + PageCrawler.contentProblematicUrls.get() + " urls) were urls which had problematic content.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MetaDocUrlsHandler {
public class MetadataHandler {

private static final Logger logger = LoggerFactory.getLogger(MetaDocUrlsHandler.class);
private static final Logger logger = LoggerFactory.getLogger(MetadataHandler.class);


// Order-independent META_RESTRICTED_ACCESS_RIGHTS-regex.
Expand Down Expand Up @@ -72,7 +72,7 @@ else if ( !LoaderAndChecker.retrieveDocuments )
* @param pageHtml
* @return
*/
public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml)
public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml)
{
// Before checking for the MetaDocUrl, check whether this publication is restricted or not. It may have a metaDocUrl, but it will redirect to the landing page.
// e.g.: https://le.uwpress.org/content/78/2/260
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
return;
}

// Check if the docLink is provided in a metaTag and connect to it directly.
if ( MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl(urlId, sourceUrl, pageUrl, pageDomain, pageHtml) )
// Check if this publication is (likely) open-access and, if so, check whether the docLink is provided in a metaTag and connect to it directly.
if ( MetadataHandler.checkAndHandleMetadata(urlId, sourceUrl, pageUrl, pageDomain, pageHtml) )
return; // The sourceUrl is already logged inside the called method.

HashSet<String> currentPageLinks = null; // We use "HashSet" to avoid duplicates.
Expand Down

0 comments on commit fd3c79b

Please sign in to comment.