Skip to content

Commit

Permalink
Merge pull request #317 from charvolant/master
Browse files Browse the repository at this point in the history
Release 1.4.13
  • Loading branch information
charvolant authored Mar 11, 2021
2 parents ac16d95 + 7c38ed9 commit fd68ddf
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 18 deletions.
6 changes: 3 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ buildscript {
}
}

version "1.4.12"
version "1.4.13"
group "au.org.ala"

apply plugin:"eclipse"
Expand Down Expand Up @@ -72,8 +72,8 @@ dependencies {
compile "org.jsoup:jsoup:1.8.3"
compile 'org.grails.plugins:external-config:1.1.1'

compile group: 'org.grails.plugins', name: 'ala-bootstrap3', version: '3.0.6'
compile(group: 'org.grails.plugins', name: 'ala-auth', version:'3.1.0') {
compile group: 'org.grails.plugins', name: 'ala-bootstrap3', version: '3.2.3'
compile(group: 'org.grails.plugins', name: 'ala-auth', version:'3.1.3') {
exclude group: 'javax.servlet', module: 'servlet-api'
}
compile group: 'org.grails.plugins', name: 'ala-admin-plugin', version: '2.1'
Expand Down
5 changes: 4 additions & 1 deletion grails-app/conf/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,17 @@ images:
index: true
wordPress:
service: https://www.ala.org.au
sitemap: /sitemap.xml
sitemap: /xmlsitemap.xml
index: index.xml
page: /?page_id={0}
timeout: 10000
validateTLS: false
titleSelector: head > title
contentSelector: body main
idSelector: head > meta[name=id]
shortLinkSelector: head > link[rel=shortlink]
excludedLocations:
- .*/category/.*
excludedCategories:
- button
contentOnlyParams: ?content-only=1&categories=1
Expand Down
13 changes: 2 additions & 11 deletions grails-app/services/au/org/ala/bie/ImportService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -543,18 +543,9 @@ class ImportService implements GrailsConfigurationAware {
log.debug documentCount + ". Indexing WP page - id: " + document.id + " | title: " + document.title + " | text: " + StringUtils.substring(document.body, 0, 100) + "... ";
def doc = [:]
doc["idxtype"] = IndexDocType.WORDPRESS.name()

if (StringUtils.isNotBlank(document.shortlink)) {
doc["guid"] = document.shortlink
} else if (StringUtils.isNotEmpty(document.id)) {
doc["guid"] = Encoder.buildServiceUrl(wordPressBaseUrl, wordPressPageFormat, document.id).toExternalForm()
} else {
// fallback
doc["guid"] = pageUrl
}

doc["id"] = "wp" + document.id // probably not needed but safer to leave in
doc["name"] = document.title // , 1.2f
doc["guid"] = pageUrl
doc["name"] = document.title
doc["content"] = document.body
doc["linkIdentifier"] = pageUrl
//doc["australian_s"] = "recorded" // so they appear in default QF search
Expand Down
20 changes: 17 additions & 3 deletions grails-app/services/au/org/ala/bie/WordpressService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,24 @@ import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.select.Elements

import java.util.function.Predicate
import java.util.regex.Pattern

/**
* Service for accessing Word Press pages
*/
class WordpressService implements IndexingInterface, GrailsConfigurationAware {
String service
String sitemap
String index
int timeout
boolean validateTLS
String titleSelector
String contentSelector
String idSelector
String shortLinkSelector
String contentOnlyParams
List<Predicate<String>> excludedLocations

/**
* Set up service with configuration
Expand All @@ -45,13 +50,15 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
void setConfiguration(Config config) {
// Copy the wordPress.* settings from the application configuration into this service's fields.
this.service = config.wordPress.service
this.sitemap = config.wordPress.sitemap
// Name appended to sitemap locations that end with '/' when sitemap index entries are queued.
this.index = config.wordPress.index
// Typed lookups with explicit fallbacks: 10000 for the timeout, false for TLS validation.
// NOTE(review): the timeout unit is not visible here (presumably milliseconds) - confirm against the code that uses it.
this.timeout = config.getProperty("wordPress.timeout", Integer, 10000)
this.validateTLS = config.getProperty("wordPress.validateTLS", Boolean, false)
// Selectors used to pull the title, content, id and short link out of a retrieved page.
this.titleSelector = config.wordPress.titleSelector
this.contentSelector = config.wordPress.contentSelector
this.idSelector = config.wordPress.idSelector
this.shortLinkSelector = config.wordPress.shortLinkSelector
// Fall back to an empty string when no content-only parameters are configured.
this.contentOnlyParams = config.wordPress.contentOnlyParams ?: ""
// Compile each configured exclusion regex once, up front; an absent setting yields an empty list.
// Pattern.asPredicate() tests with find() semantics, so a pattern matches if it occurs anywhere in the tested string.
this.excludedLocations = (config.wordPress.excludedLocations ?: []).collect { Pattern.compile(it).asPredicate() }
}

/**
Expand Down Expand Up @@ -84,15 +91,22 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
Elements sitemaps = doc.select("sitemapindex sitemap loc")
sitemaps.each { loc ->
try {
URL url = new URL(loc.text())
String sitemap = loc.text()
if (sitemap.endsWith('/')) {
sitemap = sitemap + this.index
}
URL url = new URL(sitemap)
queue << url
} catch (MalformedURLException mex) {
} catch (MalformedURLException mex) {
log.warn "Site map URL ${loc.text()} is malformed"
}
}
Elements pages = doc.select("urlset url loc")
pages.each { loc ->
locations << loc.text()
String url = loc.text()
if (!this.excludedLocations.any { it.test(url) }) {
locations << url
}
}
} catch (IOException ex) {
log.warn "Unable to retrieve ${source}: ${ex.message}, ignoring"
Expand Down
1 change: 1 addition & 0 deletions src/test/groovy/au/org/ala/bie/WordPressServiceSpec.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class WordPressServiceSpec extends Specification {
def setupSpec() {
// call web service once only and store results in static vars
service.setConfiguration(grailsApplication.config)
service.sitemap = '/sitemap.xml' // Depends on wordpress implementation!
pages = service.resources("")
firstPageUrl = pages.get(1) // homepage has no body so use second page for testing
firstPageMap = service.getResource(firstPageUrl)
Expand Down

0 comments on commit fd68ddf

Please sign in to comment.