diff --git a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java index cc597d0..1fc452f 100644 --- a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java +++ b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java @@ -1,7 +1,7 @@ package eu.openaire.publications_retriever; import eu.openaire.publications_retriever.crawler.MachineLearning; -import eu.openaire.publications_retriever.crawler.MetaDocUrlsHandler; +import eu.openaire.publications_retriever.crawler.MetadataHandler; import eu.openaire.publications_retriever.crawler.PageCrawler; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; @@ -379,7 +379,7 @@ public static void showStatistics(Instant startTime) logger.info("From which docUrls, we were able to retrieve: " + numOfStoredDocFiles + " distinct docFiles. That's about: " + df.format(numOfStoredDocFiles * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%." + " The un-retrieved docFiles were either belonging to already-found docUrls or they had connection-issues."); } - logger.debug("The metaDocUrl-handler is responsible for the discovery of " + MetaDocUrlsHandler.numOfMetaDocUrlsFound + " docUrls (" + df.format(MetaDocUrlsHandler.numOfMetaDocUrlsFound.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls)."); + logger.debug("The metaDocUrl-handler is responsible for the discovery of " + MetadataHandler.numOfMetaDocUrlsFound + " docUrls (" + df.format(MetadataHandler.numOfMetaDocUrlsFound.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls)."); logger.debug("The re-crossed docUrls (from all handlers) were " + ConnSupportUtils.reCrossedDocUrls.get() + ". 
That's about " + df.format(ConnSupportUtils.reCrossedDocUrls.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls."); if ( MachineLearning.useMLA ) logger.debug("The M.L.A. is responsible for the discovery of " + MachineLearning.docUrlsFoundByMLA.get() + " of the docUrls (" + df.format(MachineLearning.docUrlsFoundByMLA.get() * 100.0 / UrlUtils.sumOfDocUrlsFound.get()) + "%). The M.L.A.'s average success-rate was: " + df.format(MachineLearning.getAverageSuccessRate()) + "%. Gathered data for " + MachineLearning.timesGatheredData.get() + " valid pageUrl-docUrl pairs."); @@ -387,7 +387,7 @@ public static void showStatistics(Instant startTime) logger.debug("The M.L.A. was not enabled."); logger.debug("About " + df.format(LoaderAndChecker.connProblematicUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + LoaderAndChecker.connProblematicUrls.get() + " urls) were pages which had connectivity problems."); - logger.debug("About " + df.format(MetaDocUrlsHandler.numOfProhibitedAccessPagesFound.get() * 100.0 / inputCheckedUrlNum) + "% (" + MetaDocUrlsHandler.numOfProhibitedAccessPagesFound.get() + " urls) were pages with prohibited access."); + logger.debug("About " + df.format(MetadataHandler.numOfProhibitedAccessPagesFound.get() * 100.0 / inputCheckedUrlNum) + "% (" + MetadataHandler.numOfProhibitedAccessPagesFound.get() + " urls) were pages with prohibited access."); logger.debug("About " + df.format(UrlTypeChecker.pagesNotProvidingDocUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + UrlTypeChecker.pagesNotProvidingDocUrls.get() + " urls) were pages which did not provide docUrls."); logger.debug("About " + df.format(UrlTypeChecker.longToRespondUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + UrlTypeChecker.longToRespondUrls.get() + " urls) were urls which belong to domains which take too long to respond."); logger.debug("About " + df.format(PageCrawler.contentProblematicUrls.get() * 100.0 / inputCheckedUrlNum) + "% (" + 
PageCrawler.contentProblematicUrls.get() + " urls) were urls which had problematic content."); diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java similarity index 98% rename from src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java rename to src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java index d3706a8..a4b8c0e 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java @@ -13,9 +13,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -public class MetaDocUrlsHandler { +public class MetadataHandler { - private static final Logger logger = LoggerFactory.getLogger(MetaDocUrlsHandler.class); + private static final Logger logger = LoggerFactory.getLogger(MetadataHandler.class); // Order-independent META_RESTRICTED_ACCESS_RIGHTS-regex. @@ -72,7 +72,7 @@ else if ( !LoaderAndChecker.retrieveDocuments ) * @param pageHtml * @return */ - public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml) + public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml) { // Before checking for the MetaDocUrl, check whether this publication is restricted or not. It may have a metaDocUrl, but it will redirect to the landing page. 
// e.g.: https://le.uwpress.org/content/78/2/260 diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java index 41ce50a..4c7013a 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java @@ -107,8 +107,8 @@ else if ( firstHTMLlineFromDetectedContentType != null ) { return; } - // Check if the docLink is provided in a metaTag and connect to it directly. - if ( MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl(urlId, sourceUrl, pageUrl, pageDomain, pageHtml) ) + // Check if this publication is (likely) open-access and then check whether the docLink is provided in a metaTag and, if so, connect to it directly. + if ( MetadataHandler.checkAndHandleMetadata(urlId, sourceUrl, pageUrl, pageDomain, pageHtml) ) return; // The sourceUrl is already logged inside the called method. HashSet currentPageLinks = null; // We use "HashSet" to avoid duplicates.