Skip to content

Commit

Permalink
Merge pull request #314 from charvolant/issue-311
Browse files Browse the repository at this point in the history
Improve wordpress crawl
  • Loading branch information
charvolant authored Mar 1, 2021
2 parents 200825e + e87be2b commit defa3de
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
5 changes: 4 additions & 1 deletion grails-app/conf/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,17 @@ images:
index: true
wordPress:
service: https://www.ala.org.au
sitemap: /sitemap.xml
sitemap: /xmlsitemap.xml
index: index.xml
page: /?page_id={0}
timeout: 10000
validateTLS: false
titleSelector: head > title
contentSelector: body main
idSelector: head > meta[name=id]
shortLinkSelector: head > link[rel=shortlink]
excludedLocations:
- .*/category/.*
excludedCategories:
- button
contentOnlyParams: ?content-only=1&categories=1
Expand Down
20 changes: 17 additions & 3 deletions grails-app/services/au/org/ala/bie/WordpressService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,24 @@ import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.select.Elements

import java.util.function.Predicate
import java.util.regex.Pattern

/**
* Service for accessing Word Press pages
*/
class WordpressService implements IndexingInterface, GrailsConfigurationAware {
String service
String sitemap
String index
int timeout
boolean validateTLS
String titleSelector
String contentSelector
String idSelector
String shortLinkSelector
String contentOnlyParams
List<Predicate<String>> excludedLocations

/**
* Set up service with configuration
Expand All @@ -45,13 +50,15 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
void setConfiguration(Config config) {
// Copy the wordPress.* settings from the supplied configuration into this service's fields.
this.service = config.wordPress.service
this.sitemap = config.wordPress.sitemap
// Name appended to a sitemap location that ends in '/' when sitemaps are crawled.
this.index = config.wordPress.index
// Typed lookups with explicit fallbacks: 10000 for the timeout, false for TLS validation.
// NOTE(review): timeout unit is presumably milliseconds -- confirm where it is used.
this.timeout = config.getProperty("wordPress.timeout", Integer, 10000)
this.validateTLS = config.getProperty("wordPress.validateTLS", Boolean, false)
this.titleSelector = config.wordPress.titleSelector
this.contentSelector = config.wordPress.contentSelector
this.idSelector = config.wordPress.idSelector
this.shortLinkSelector = config.wordPress.shortLinkSelector
// Fall back to an empty string when the setting is absent or otherwise Groovy-false.
this.contentOnlyParams = config.wordPress.contentOnlyParams ?: ""
// Compile each configured exclusion into a regex-backed predicate; an absent or empty
// setting yields an empty list. Pattern.asPredicate() tests with find() semantics, so a
// pattern only needs to match somewhere within the tested string.
this.excludedLocations = (config.wordPress.excludedLocations ?: []).collect { Pattern.compile(it).asPredicate() }
}

/**
Expand Down Expand Up @@ -84,15 +91,22 @@ class WordpressService implements IndexingInterface, GrailsConfigurationAware {
Elements sitemaps = doc.select("sitemapindex sitemap loc")
sitemaps.each { loc ->
try {
URL url = new URL(loc.text())
String sitemap = loc.text()
if (sitemap.endsWith('/')) {
sitemap = sitemap + this.index
}
URL url = new URL(sitemap)
queue << url
} catch (MalformedURLException mex) {
} catch (MalformedURLException mex) {
log.warn "Site map URL ${loc.text()} is malformed"
}
}
Elements pages = doc.select("urlset url loc")
pages.each { loc ->
locations << loc.text()
String url = loc.text()
if (!this.excludedLocations.any { it.test(url) }) {
locations << url
}
}
} catch (IOException ex) {
log.warn "Unable to retrieve ${source}: ${ex.message}, ignoring"
Expand Down
1 change: 1 addition & 0 deletions src/test/groovy/au/org/ala/bie/WordPressServiceSpec.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class WordPressServiceSpec extends Specification {
def setupSpec() {
// call web service once only and store results in static vars
service.setConfiguration(grailsApplication.config)
service.sitemap = '/sitemap.xml' // Depends on wordpress implementation!
pages = service.resources("")
firstPageUrl = pages.get(1) // homepage has no body so use second page for testing
firstPageMap = service.getResource(firstPageUrl)
Expand Down

0 comments on commit defa3de

Please sign in to comment.