# Patch summary (free text before the first "diff --git" header).
#
# - build.gradle: version 1.4.12 -> 1.4.13; ala-bootstrap3 3.0.6 -> 3.2.3;
#   ala-auth 3.1.0 -> 3.1.3.
# - application.yml: wordPress.sitemap becomes /xmlsitemap.xml; adds
#   wordPress.index (index.xml) and wordPress.excludedLocations (a list of
#   regular expressions).
# - ImportService: the WordPress document "guid" is now always pageUrl; the
#   shortlink / page-id fallbacks are removed.
# - WordpressService: sitemap-index entries ending in '/' get the configured
#   index file name appended before being queued; page URLs matching any
#   excludedLocations pattern are not added to the result.
# - WordPressServiceSpec: pins service.sitemap to '/sitemap.xml' for the test.
#
# NOTE(review): this patch was recovered from a copy whose line breaks and
# indentation had been collapsed. Confirm against the original commit:
#   * Indentation depths are reconstructed by convention.
#   * Blank context lines are placed where the hunk line counts require
#     them; the one in the second build.gradle hunk is a best guess.
#   * "List<Predicate<String>> excludedLocations" was garbled to "List>";
#     the element type is inferred from the added Predicate import and the
#     Pattern.compile(it).asPredicate() initialiser.
#   * The ImportService hunk header declares 18 old lines but only 17 could
#     be recovered; one removed line is presumably missing.
#   * The removed/added "} catch (MalformedURLException mex) {" pair differed
#     only in whitespace, which is no longer recoverable; both are shown
#     identically.
diff --git a/build.gradle b/build.gradle
index 04d1196d..cac18797 100644
--- a/build.gradle
+++ b/build.gradle
@@ -11,7 +11,7 @@ buildscript {
     }
 }
 
-version "1.4.12"
+version "1.4.13"
 group "au.org.ala"
 
 apply plugin:"eclipse"
@@ -72,8 +72,8 @@ dependencies {
     compile "org.jsoup:jsoup:1.8.3"
     compile 'org.grails.plugins:external-config:1.1.1'
 
-    compile group: 'org.grails.plugins', name: 'ala-bootstrap3', version: '3.0.6'
-    compile(group: 'org.grails.plugins', name: 'ala-auth', version:'3.1.0') {
+    compile group: 'org.grails.plugins', name: 'ala-bootstrap3', version: '3.2.3'
+    compile(group: 'org.grails.plugins', name: 'ala-auth', version:'3.1.3') {
         exclude group: 'javax.servlet', module: 'servlet-api'
     }
     compile group: 'org.grails.plugins', name: 'ala-admin-plugin', version: '2.1'
diff --git a/grails-app/conf/application.yml b/grails-app/conf/application.yml
index 350b3863..40ee0a1f 100644
--- a/grails-app/conf/application.yml
+++ b/grails-app/conf/application.yml
@@ -146,7 +146,8 @@ images:
   index: true
 wordPress:
   service: https://www.ala.org.au
-  sitemap: /sitemap.xml
+  sitemap: /xmlsitemap.xml
+  index: index.xml
   page: /?page_id={0}
   timeout: 10000
   validateTLS: false
@@ -154,6 +155,8 @@ wordPress:
   contentSelector: body main
   idSelector: head > meta[name=id]
   shortLinkSelector: head > link[rel=shortlink]
+  excludedLocations:
+    - .*/category/.*
   excludedCategories:
     - button
   contentOnlyParams: ?content-only=1&categories=1
diff --git a/grails-app/services/au/org/ala/bie/ImportService.groovy b/grails-app/services/au/org/ala/bie/ImportService.groovy
index 072515fb..a64a36d9 100644
--- a/grails-app/services/au/org/ala/bie/ImportService.groovy
+++ b/grails-app/services/au/org/ala/bie/ImportService.groovy
@@ -543,18 +543,9 @@ class ImportService implements GrailsConfigurationAware {
                     log.debug documentCount + ". Indexing WP page - id: " + document.id + " | title: " + document.title + " | text: " + StringUtils.substring(document.body, 0, 100) + "... ";
                     def doc = [:]
                     doc["idxtype"] = IndexDocType.WORDPRESS.name()
-
-                    if (StringUtils.isNotBlank(document.shortlink)) {
-                        doc["guid"] = document.shortlink
-                    } else if (StringUtils.isNotEmpty(document.id)) {
-                        doc["guid"] = Encoder.buildServiceUrl(wordPressBaseUrl, wordPressPageFormat, document.id).toExternalForm()
-                    } else {
-                        // fallback
-                        doc["guid"] = pageUrl
-                    }
                     doc["id"] = "wp" + document.id // probably not needed but safer to leave in
-                    doc["name"] = document.title // , 1.2f
+                    doc["guid"] = pageUrl
+                    doc["name"] = document.title
                     doc["content"] = document.body
                     doc["linkIdentifier"] = pageUrl
                     //doc["australian_s"] = "recorded" // so they appear in default QF search
diff --git a/grails-app/services/au/org/ala/bie/WordpressService.groovy b/grails-app/services/au/org/ala/bie/WordpressService.groovy
index a401b5c9..0f9a7d15 100644
--- a/grails-app/services/au/org/ala/bie/WordpressService.groovy
+++ b/grails-app/services/au/org/ala/bie/WordpressService.groovy
@@ -22,12 +22,16 @@ import org.jsoup.Jsoup
 import org.jsoup.nodes.Document
 import org.jsoup.select.Elements
 
+import java.util.function.Predicate
+import java.util.regex.Pattern
+
 /**
  * Service for accessing Word Press pages
  */
 class WordpressService implements IndexingInterface, GrailsConfigurationAware {
     String service
     String sitemap
+    String index
     int timeout
     boolean validateTLS
     String titleSelector
@@ -35,6 +39,7 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
     String idSelector
     String shortLinkSelector
     String contentOnlyParams
+    List<Predicate<String>> excludedLocations
 
     /**
      * Set up service with configuration
@@ -45,6 +50,7 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
     void setConfiguration(Config config) {
         this.service = config.wordPress.service
         this.sitemap = config.wordPress.sitemap
+        this.index = config.wordPress.index
         this.timeout = config.getProperty("wordPress.timeout", Integer, 10000)
         this.validateTLS = config.getProperty("wordPress.validateTLS", Boolean, false)
         this.titleSelector = config.wordPress.titleSelector
@@ -52,6 +58,7 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
         this.idSelector = config.wordPress.idSelector
         this.shortLinkSelector = config.wordPress.shortLinkSelector
         this.contentOnlyParams = config.wordPress.contentOnlyParams ?: ""
+        this.excludedLocations = (config.wordPress.excludedLocations ?: []).collect { Pattern.compile(it).asPredicate() }
     }
 
     /**
@@ -84,15 +91,22 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
                 Elements sitemaps = doc.select("sitemapindex sitemap loc")
                 sitemaps.each { loc ->
                     try {
-                        URL url = new URL(loc.text())
+                        String sitemap = loc.text()
+                        if (sitemap.endsWith('/')) {
+                            sitemap = sitemap + this.index
+                        }
+                        URL url = new URL(sitemap)
                         queue << url
-                    } catch (MalformedURLException mex) {
+                    } catch (MalformedURLException mex) {
                         log.warn "Site map URL ${loc.text()} is malformed"
                     }
                 }
                 Elements pages = doc.select("urlset url loc")
                 pages.each { loc ->
-                    locations << loc.text()
+                    String url = loc.text()
+                    if (!this.excludedLocations.any { it.test(url) }) {
+                        locations << url
+                    }
                 }
             } catch (IOException ex) {
                 log.warn "Unable to retrieve ${source}: ${ex.message}, ignoring"
diff --git a/src/test/groovy/au/org/ala/bie/WordPressServiceSpec.groovy b/src/test/groovy/au/org/ala/bie/WordPressServiceSpec.groovy
index 231953f6..f7679b12 100644
--- a/src/test/groovy/au/org/ala/bie/WordPressServiceSpec.groovy
+++ b/src/test/groovy/au/org/ala/bie/WordPressServiceSpec.groovy
@@ -33,6 +33,7 @@ class WordPressServiceSpec extends Specification {
     def setupSpec() {
         // call web service once only and store results in static vars
         service.setConfiguration(grailsApplication.config)
+        service.sitemap = '/sitemap.xml' // Depends on wordpress implementation!
         pages = service.resources("")
         firstPageUrl = pages.get(1) // homepage has no body so use second page for testing
         firstPageMap = service.getResource(firstPageUrl)