Skip to content

Commit

Permalink
Handle the special domain "scielo.br". It contains an encoded "&" sym…
Browse files Browse the repository at this point in the history
…bol which causes the pdf-links (from the input) to redirect to the landing page and have to extract them from there, instead of acquiring them right away.
  • Loading branch information
LSmyrnaios committed Jan 15, 2024
1 parent 97ed774 commit 0983268
Showing 1 changed file with 38 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ public static String checkAndHandleSpecialUrls(String resourceUrl) throws Runtim
} else if ( (updatedUrl = checkAndHandleOSFurls(resourceUrl)) != null ) {
//logger.debug("OSF-PageURL: " + resourceURL + " to possible-docUrl: " + updatedUrl); // DEBUG!
resourceUrl = updatedUrl;
/*} else if ( (updatedUrl = checkAndHandleWileyUrls(resourceUrl)) != null ) {
//logger.debug("Wiley-PageURL: " + resourceURL + " to possible-docUrl: " + updatedUrl); // DEBUG!
resourceUrl = updatedUrl;*/
} else if ( (updatedUrl = checkAndHandleScieloUrls(resourceUrl)) != null ) {
//logger.debug("Scielo-PageURL: " + resourceURL + " to possible-docUrl: " + updatedUrl); // DEBUG!
resourceUrl = updatedUrl;
} else
resourceUrl = checkAndHandleDergipark(resourceUrl); // It returns the same url if nothing was handled.

Expand Down Expand Up @@ -325,4 +331,36 @@ else if ( !pageUrl.endsWith("/") )
return (pageUrl + "download");
}


////////// onlinelibrary.wiley.com /////////////////
// https://onlinelibrary.wiley.com/doi/10.1111/polp.12377 --> https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/polp.12377
// The https://onlinelibrary.wiley.com/doi/pdf/10.1111/polp.12377 opens a page with JS and auto-redirects (using JS, NOT http-3XX) to the "/pdfdirect/" version.
// Also:
// https://onlinelibrary.wiley.com/doi/10.1111/polp.12377 --> https://onlinelibrary.wiley.com/doi/epdf/10.1111/polp.12377 (page with the pdf in view and a download button)
// -->
// TODO - CHECK IT AGAIN
/*public static String checkAndHandleWileyUrls(String pageUrl)
{
if ( !pageUrl.contains("onlinelibrary.wiley.com/doi/") ) // We want to transform only the urls which belong to this subdomain and have this structure.
return null; // It's from another domain, keep looking..
if ( pageUrl.contains("epdf/") ) // It's a script-depending pdf-url which needs transformation.
return StringUtils.replace(pageUrl, "epdf/", "pdfdirect/", 1);
else
return StringUtils.replace(pageUrl, "/doi/", "/doi/pdfdirect/", 1);
}*/


//////////////////////// www.scielo.br ///////////////////////
// https://www.scielo.br/j/bjb/a/64jBbrbZ8hG3fvhy6d6nczj/?amp;format=pdf&lang=en --> REPLACE THE PROBLEMATIC "amp;" with "&".
// https://www.scielo.br/j/bjb/a/64jBbrbZ8hG3fvhy6d6nczj/?&format=pdf&lang=en
// We could simply remove the "amp;" instead of replacing it, since the replacement leaves a slightly odd (yet valid) "?&" sequence. BUT, it is better to replace it, for consistency.
/**
 * Repairs the broken ampersand-encoding inside the urls of the "scielo.br" domain.
 * Such urls may carry a bare {@code amp;} (an encoded ampersand which has lost its leading "&"),
 * e.g. {@code /?amp;format=pdf&lang=en}, which has to become {@code /?&format=pdf&lang=en}.
 * A complete {@code &amp;} entity is decoded to a single "&" as well, instead of ending up as "&&".
 *
 * @param pageUrl the url to check and possibly transform; it must not be null.
 * @return the url having every {@code &amp;} and every bare {@code amp;} replaced by "&"
 *         (the url is returned as it is, if it contains none of them),
 *         or null if the url does not contain "scielo.br".
 */
public static String checkAndHandleScieloUrls(String pageUrl)
{
	if ( !pageUrl.contains("scielo.br") )	// We want to transform only the urls which contain this domain.
		return null;	// It's from another domain, keep looking..

	// Decode the complete entity first; a blind "amp;" --> "&" replacement would turn "&amp;" into "&&".
	// Afterwards, handle the remaining bare "amp;" occurrences (e.g. "?amp;format=pdf").
	return pageUrl.replace("&amp;", "&").replace("amp;", "&");
}

}

0 comments on commit 0983268

Please sign in to comment.