diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java
index e2af309..872947e 100644
--- a/src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java
+++ b/src/main/java/eu/openaire/publications_retriever/crawler/MetadataHandler.java
@@ -44,12 +44,12 @@ public class MetadataHandler
 	{
 		String regex = ".+\\.(?:";
 		if ( !LoaderAndChecker.retrieveDatasets )
-			regex += "zip|rar|";	// If no datasets retrieved, block these types.
+			regex += LoaderAndChecker.dataset_formats;	// If no datasets retrieved, block these types.
 		else if ( !LoaderAndChecker.retrieveDocuments )
-			regex += "pdf|doc[x]?|";	// If no documents retrieved, block these types.
+			regex += "pdf|" + UrlTypeChecker.unsupportedDocFileTypes;	// If no documents retrieved, block these types.
 		//else -> no more datatype-dependent additions
-		regex += "apk|jpg|png)(?:\\?.+)?$";
+		regex += "|apk|jpg|png)(?:\\?.+)?$";
 		logger.debug("COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS -> REGEX: " + regex);
 		COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS = Pattern.compile(regex);
 	}
diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java
index ef1acb0..1087169 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java
@@ -40,9 +40,9 @@ public class LoaderAndChecker
 	// "DOC_URL_FILTER" works for lowerCase Strings (we make sure they are in lowerCase before we check).
 	// Note that we still need to check if it's an alive link and if it's actually a docUrl (though it's mimeType).
-	private static final String dataset_formats = "xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather"
+	public static final String dataset_formats = "(?:xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather"
 			+ "|svg|sas7b(?:dat|ve)|spss|sas|stata|(?:my|postgre)?sql(?:ite)?|bigquery|sh[px]|sb[xn]|prj|dbf|(?:m|acc)db|mif|mat|pcd|bt|n[sc]?[\\d]*|h[\\d]+|hdf[\\d]*|trs|opj|jcamp|fcs|fas(?:ta)?|keys|values|las|rdata|parquet|avro|sql|dcm|gr[i]?b]|rds"
-			+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile";
+			+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile)";

 	public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)" + dataset_formats + "(?:\\?.+)?$)");
diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
index aeb3f4b..d28d7bb 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
@@ -19,7 +19,7 @@ public class UrlTypeChecker
 {
 	private static final Logger logger = LoggerFactory.getLogger(UrlTypeChecker.class);

-	private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)";
+	private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})";

 	private static final String mediaExtensionsPattern = "ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov";

@@ -28,9 +28,10 @@
 	private static final String docOrDatasetNegativeLookAroundPattern = "(?
diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlUtils.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlUtils.java
--- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlUtils.java
 	public static Set<String> duplicateUrls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
@@ -53,7 +53,7 @@ public class UrlUtils
 	 * @param urlId (it may be null if no id was provided in the input)
 	 * @param sourceUrl
 	 * @param pageUrl
-	 * @param docUrl
+	 * @param docOrDatasetUrl
 	 * @param comment
 	 * @param pageDomain (it may be null)
 	 * @param isFirstCrossed
@@ -65,24 +65,24 @@ public class UrlUtils
 	 * @param fileSize
 	 * @param fileHash
 	 */
-	public static void addOutputData(String urlId, String sourceUrl, String pageUrl, String docUrl, String comment, String pageDomain,
+	public static void addOutputData(String urlId, String sourceUrl, String pageUrl, String docOrDatasetUrl, String comment, String pageDomain,
 									 boolean isFirstCrossed, String wasUrlChecked, String wasUrlValid, String wasDocumentOrDatasetAccessible, String wasDirectLink, String couldRetry, Long fileSize, String fileHash)
 	{
-		String finalDocUrl = docUrl;
+		String finalDocOrDatasetUrl = docOrDatasetUrl;

-		if ( !finalDocUrl.equals(duplicateUrlIndicator) )
+		if ( !finalDocOrDatasetUrl.equals(duplicateUrlIndicator) )
 		{
-			if ( !finalDocUrl.equals(unreachableDocOrDatasetUrlIndicator) )
+			if ( !finalDocOrDatasetUrl.equals(unreachableDocOrDatasetUrlIndicator) )
 			{
 				sumOfDocUrlsFound.incrementAndGet();

-				// Remove the "temporalId" from urls for "cleaner" output and "already found docUrl"-matching. These IDs will expire eventually anyway.
-				String lowerCaseUrl = finalDocUrl.toLowerCase();
+				// Remove the "temporalId" from urls for "cleaner" output and "already found docOrDatasetUrl"-matching. These IDs will expire eventually anyway.
+				String lowerCaseUrl = finalDocOrDatasetUrl.toLowerCase();
 				if ( lowerCaseUrl.contains("token") || lowerCaseUrl.contains("jsessionid") )
-					finalDocUrl = UrlUtils.removeTemporalIdentifier(finalDocUrl);	// We send the non-lowerCase-url as we may want to continue with that docUrl in case of an error.
+					finalDocOrDatasetUrl = UrlUtils.removeTemporalIdentifier(finalDocOrDatasetUrl);	// We send the non-lowerCase-url as we may want to continue with that docOrDatasetUrl in case of an error.

-				if ( isFirstCrossed )	// Add this id, only if this is a first-crossed docUrl.
-					docOrDatasetUrlsWithIDs.put(finalDocUrl, new IdUrlTuple(urlId, sourceUrl));	// Add it here, in order to be able to recognize it and quick-log it later, but also to distinguish it from other duplicates.
+				if ( isFirstCrossed )	// Add this id, only if this is a first-crossed docOrDatasetUrl.
+					docOrDatasetUrlsWithIDs.put(finalDocOrDatasetUrl, new IdUrlTuple(urlId, sourceUrl));	// Add it here, in order to be able to recognize it and quick-log it later, but also to distinguish it from other duplicates.

 				if ( pageDomain == null )
 					pageDomain = UrlUtils.getDomainStr(pageUrl, null);
@@ -91,26 +91,26 @@ public static void addOutputData(String urlId, String sourceUrl, String pageUrl,
 				{
 					// Gather data for the MLA, if we decide to have it enabled.
 					if ( MachineLearning.useMLA )
-						MachineLearning.gatherMLData(pageUrl, finalDocUrl, pageDomain);
+						MachineLearning.gatherMLData(pageUrl, finalDocOrDatasetUrl, pageDomain);

-					// Add the domains of the pageUrl and the finalDocUrl to the successful domains as both lead in some way to a docUrl.
+					// Add the domains of the pageUrl and the finalDocOrDatasetUrl to the successful domains as both lead in some way to a docOrDatasetUrl.
 					// The data inside ConcurrentHashMap "domainsAndHits" is used to evaluate how good the domain is doing while is having some problems.
 					// If the "goods" surpass the "bads", then that domain will not get blocked, even if the "minimum-accepted-bad-cases" was exceeded.
 					ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, pageDomain);

-					// Now if the "finalDocUrl" is different from the "pageUrl", get the domain of the "finalDocUrl" and if it's different, then add it to "domainsAndHits"-HashMap.
-					if ( !pageUrl.equals(finalDocUrl) ) {
-						String docUrlDomain = UrlUtils.getDomainStr(finalDocUrl, null);
+					// Now if the "finalDocOrDatasetUrl" is different from the "pageUrl", get the domain of the "finalDocOrDatasetUrl" and if it's different, then add it to "domainsAndHits"-HashMap.
+					if ( !pageUrl.equals(finalDocOrDatasetUrl) ) {
+						String docUrlDomain = UrlUtils.getDomainStr(finalDocOrDatasetUrl, null);
 						if ( (docUrlDomain != null) && !docUrlDomain.equals(pageDomain) )
 							ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, docUrlDomain);
 					}
 				}
 			}
-			else	// Else if this url is not a docUrl and has not been processed before..
+			else	// Else if this url is not a docOrDatasetUrl and has not been processed before..
 				duplicateUrls.add(sourceUrl);	// Add it in duplicates BlackList, in order not to be accessed for 2nd time in the future. We don't add docUrls here, as we want them to be separate for checking purposes.
 		}

-		FileUtils.dataForOutput.add(new DataForOutput(urlId, sourceUrl, pageUrl, finalDocUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry, fileHash, fileSize, comment));	// Log it to be written later in the outputFile.
+		FileUtils.dataForOutput.add(new DataForOutput(urlId, sourceUrl, pageUrl, finalDocOrDatasetUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry, fileHash, fileSize, comment));	// Log it to be written later in the outputFile.
 	}
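
Reviewer note (not part of the patch): the sketch below is a quick, standalone sanity check of the regex composition that MetadataHandler now performs, i.e. ".+\\.(?:" + LoaderAndChecker.dataset_formats + "|apk|jpg|png)(?:\\?.+)?$" when datasets are not being retrieved. The class name, the trimmed-down dataset_formats excerpt and the sample URLs are made up for illustration; the unsupportedDocFileTypes branch is left out because its value is not visible in this diff.

import java.util.regex.Pattern;

// Standalone sketch, not project code: mirrors the new pattern assembly in MetadataHandler
// for the "retrieve documents only" case, using a short excerpt of dataset_formats.
public class UnsupportedExtensionsRegexSketch {

	// Hypothetical stand-in for LoaderAndChecker.dataset_formats (only a few alternatives kept).
	static final String dataset_formats = "(?:xls[xbm]?|[ct]sv|zip|[rt]ar|parquet|txt)";

	public static void main(String[] args) {
		boolean retrieveDatasets = false;	// pretend only documents are requested

		String regex = ".+\\.(?:";
		if ( !retrieveDatasets )
			regex += dataset_formats;		// block dataset extensions, as the patched code now does
		regex += "|apk|jpg|png)(?:\\?.+)?$";

		Pattern p = Pattern.compile(regex);
		System.out.println(p.matcher("https://repo.example.org/files/table.csv").matches());		// true  -> blocked
		System.out.println(p.matcher("https://repo.example.org/files/archive.zip?dl=1").matches());	// true  -> blocked
		System.out.println(p.matcher("https://repo.example.org/record/article.pdf").matches());	// false -> still allowed
	}
}

One case that may be worth a second look: when both documents and datasets are retrieved, neither branch appends anything, so the final pattern becomes ".+\\.(?:|apk|jpg|png)(?:\\?.+)?$". The leading "|" then introduces an empty alternative (the old code avoided this because each optional branch ended with a trailing "|"), which lets the group match an empty string; it only affects unusual URLs ending in "." or ".?query", but it is easy to miss.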