Skip to content

Commit

Permalink
#120 index specieslists, faster collectory indexing, #50 index additi…
Browse files Browse the repository at this point in the history
…onal biocollect fields
  • Loading branch information
Adam Collins committed Jan 2, 2024
1 parent 6428e26 commit 62295d4
Show file tree
Hide file tree
Showing 10 changed files with 247 additions and 39 deletions.
9 changes: 5 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
buildscript {
version "3.1.0-SNAPSHOT"
group "au.org.ala"
}

plugins {
id "groovy"
id "org.grails.grails-gsp"
Expand All @@ -14,10 +19,6 @@ plugins {
id "maven-publish"
}

version "3.0.0-SNAPSHOT"
group "au.org.ala"


publishing {
targetCompatibility = 1.11
repositories {
Expand Down
11 changes: 7 additions & 4 deletions grails-app/conf/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ collectory:
base: https://collections.ala.org.au
service: https://collections.ala.org.au/ws
resources: /{0}
find: /find/{0}
# A list of uids for the national species list datasets
nationalSpeciesDatsets:
#- dr2699 - dr2700 -dr2702 -dr2704 -dr2703 -dr3118
Expand Down Expand Up @@ -180,8 +181,10 @@ knowledgeBase:
sectionSelector: section.article-list .list-lead > a
articleCssSelector: .article-title a.c-link
lists:
service: https://lists.ala.org.au/ws
items: /speciesListItems/{0}?includeKVP=true&max={1}&offset={2}
service: https://lists.ala.org.au
items: /ws/speciesListItems/{0}?includeKVP=true&max={1}&offset={2}
search: /ws/speciesList/?isAuthoritative=eq:true&max={0}&offset={1}
show: /speciesListItem/list/{0}
add: /createItem
remove: /deleteItem
biocollect:
Expand Down Expand Up @@ -276,8 +279,8 @@ skin:
orgNameLong: Atlas of Living Australia
useLegacyAuto: false
import:
sequence: collectory,taxonomy-all,vernacular,denormalise,layers,regions,localities,conservation-lists,wordpress,knowledgebase,biocollect,favourites,weights,link-identifiers,images,occurrences,hidden-images,wiki-urls,suggest-index,sitemap,swap
sequenceDaily: conservation-lists,wordpress,knowledgebase,biocollect,favourites,suggest-index,images,hidden-images,wiki-urls,sitemap,swap
sequence: collectory,taxonomy-all,vernacular,denormalise,layers,regions,localities,conservation-lists,wordpress,knowledgebase,biocollect,species-lists,favourites,weights,link-identifiers,images,occurrences,hidden-images,wiki-urls,suggest-index,sitemap,swap
sequenceDaily: conservation-lists,wordpress,knowledgebase,biocollect,species-lists,favourites,suggest-index,images,hidden-images,wiki-urls,sitemap,swap
sequenceWeekly: occurrences,layers,regions,localities,suggest-index,sitemap,swap
# enable daily and weekly tasks
enableTasks: false
Expand Down
13 changes: 12 additions & 1 deletion grails-app/controllers/au/org/ala/bie/ImportController.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,18 @@ class ImportController {
// Documented in openapi.yml, not migrating to annotations because it is not intended for external use.
def importConservationSpeciesLists(){
    // 'online' selects the live SOLR core; defaults to the offline core.
    boolean online = params.getBoolean('online', false)
    // NOTE: the scraped diff retained both the pre-fix and post-fix variants of this
    // statement; only the corrected one (fixed job name and i18n key spelling) is kept.
    def job = execute("importConservationSpeciesLists", "admin.button.importlistconservation", { importService.importConservationSpeciesLists(online) })
    asJson(job.status())
}

/**
* Import/index Species Lists
*
* @return
*/
def importSpeciesLists(){
    // 'online' selects the live SOLR core; defaults to the offline core.
    boolean online = params.getBoolean('online', false)
    // Run the species list import as a named background admin job.
    def task = { importService.importSpeciesLists(online) }
    def job = execute("importSpeciesLists", "admin.button.importspecieslists", task)
    asJson(job.status())
}

Expand Down
19 changes: 19 additions & 0 deletions grails-app/i18n/messages.properties
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ admin.button.importalldwca=Import all DwCAs
admin.button.importcollectory=Import collectory
admin.button.importdwca=Import DwCA
admin.button.importlayer=Import layer information
admin.button.importspecieslists=Import species lists
admin.button.importlistconservation=Import conservation species lists
admin.button.importlistvernacular=Import vernacular name species lists
admin.button.importlistwiki=Import wiki url species lists
Expand Down Expand Up @@ -184,3 +185,21 @@ title.conjunctions=and, or, nor, but, for, yet, so
title.articles=a, an, the
title.prepositions=to, for, by, at, in, on, per, of, from
title.initials=d', O'

list.content.listType=type
list.content.itemCount=items
list.content.dateCreated=created
list.content.isAuthoritative=
list.content.isInvasive=invasive list
list.content.isThreatened=threatened list
list.content.region=region
list.SPECIES_CHARACTERS=Species characters list
list.CONSERVATION_LIST=Conservation list
list.SENSITIVE_LIST=List of sensitive species
list.LOCAL_LIST=Area checklist
list.COMMON_TRAIT=Common trait of species
list.COMMON_HABITAT=Common habitat of species
list.SPATIAL_PORTAL=Spatial portal defined
list.PROFILE=Profile list
list.TEST=Test list
list.OTHER=Other list
7 changes: 0 additions & 7 deletions grails-app/services/au/org/ala/bie/BiocollectService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,7 @@

package au.org.ala.bie

import au.org.ala.bie.indexing.IndexingInterface
import au.org.ala.bie.util.Encoder
import grails.converters.JSON
import groovy.json.JsonSlurper
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements

/**
* Service to index Biocollect projects
Expand Down
31 changes: 31 additions & 0 deletions grails-app/services/au/org/ala/bie/CollectoryService.groovy
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package au.org.ala.bie

import au.org.ala.bie.util.Encoder
import grails.converters.JSON
import groovy.json.JsonSlurper

/**
Expand All @@ -9,6 +10,8 @@ import groovy.json.JsonSlurper
class CollectoryService {
def grailsApplication

def useOldCollectory = false

/**
* Get a list of available collectory resources of a specific type
*
Expand All @@ -35,4 +38,32 @@ class CollectoryService {
def json = slurper.parseText(url.toURL().getText('UTF-8'))
return json
}

/**
 * Fetch collectory entity details for a batch of resources with a single POST
 * of their uids to the collectory "find" endpoint. If that endpoint is not
 * available (an older collectory), remember the failure and fall back to one
 * GET per resource for this and all subsequent calls.
 *
 * @param resources list of resource maps, each with a 'uid' and a 'uri'
 * @param entityType collectory entity type substituted into the find URL
 * @return list of parsed JSON detail objects, one per resource
 */
def getBatch(List resources, entityType) {
    if (!useOldCollectory) {
        HttpURLConnection conn = null
        try {
            def url = Encoder.buildServiceUrl(grailsApplication.config.collectory.service, grailsApplication.config.collectory.find, entityType)
            def bytes = (resources.collect { it.uid } as JSON).toString().getBytes("UTF-8")

            conn = (HttpURLConnection) url.openConnection()
            conn.setRequestMethod("POST")
            conn.setRequestProperty("Content-Type", "application/json")
            conn.setRequestProperty("Content-Length", String.valueOf(bytes.length))
            conn.setDoOutput(true)
            // withStream closes the output stream once the body is written
            conn.getOutputStream().withStream { it.write(bytes) }

            def txt = conn.getInputStream().getText('UTF-8')
            def response = JSON.parse(txt)

            // response is a list of JSON strings, one per entity
            return response.collect { JSON.parse(it) }
        } catch (Exception e) {
            // Batch endpoint unavailable — log why, then route this call and all
            // later ones through the per-resource fallback below.
            log.warn("collectory batch lookup failed, falling back to per-resource requests: ${e.message}")
            useOldCollectory = true
        } finally {
            // previously the connection leaked when an exception was thrown
            conn?.disconnect()
        }
    }

    // fallback to old collectory compatible request
    resources.collect { get(it.uri) }
}
}
154 changes: 131 additions & 23 deletions grails-app/services/au/org/ala/bie/ImportService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ import org.gbif.dwca.io.ArchiveFile
import org.gbif.dwca.record.Record
import org.gbif.dwca.record.StarRecord
import org.gbif.nameparser.PhraseNameParser
import org.springframework.context.MessageSource
import org.springframework.context.i18n.LocaleContextHolder

import java.text.MessageFormat
import java.text.SimpleDateFormat
import java.util.regex.Pattern
import java.util.zip.GZIPInputStream
Expand Down Expand Up @@ -116,6 +119,8 @@ class ImportService implements GrailsConfigurationAware {
def speciesGroupService
def conservationListsSource
def jobService
def grailsApplication
MessageSource messageSource

def brokerMessagingTemplate

Expand Down Expand Up @@ -287,6 +292,9 @@ class ImportService implements GrailsConfigurationAware {
case 'biocollect':
importBiocollectProjects(online)
break
case 'species-lists':
importSpeciesLists(online)
break
case 'swap':
indexService.swap()
break
Expand Down Expand Up @@ -500,39 +508,64 @@ class ImportService implements GrailsConfigurationAware {
indexService.deleteFromIndex(indexDocType, online)
log("Cleared")

drLists.each {
def details = collectoryService.get(it.uri)
def doc = [:]
doc["id"] = it.uri
doc["datasetID"] = details.uid
doc["guid"] = details.alaPublicUrl
doc["idxtype"] = indexDocType.name()
doc["name"] = details.name
doc["description"] = details.pubDescription
doc["distribution"] = "N/A"

if (details.rights)
doc["rights"] = details.rights
if (details.licenseType)
doc["license"] = (details.licenseType + " " + details.licenseVersion ?: "").trim()
if (details.acronym)
doc["acronym"] = details.acronym

entities << doc

if (entities.size() > BUFFER_SIZE) {
indexService.indexBatch(entities, online)
entities.clear()
def maxSize = 100
def batch = []
def uriMap = [:]

drLists.each { dr ->
batch.add(dr)
uriMap[dr.uid] = dr.uri

if (batch.size() == maxSize) {
collectoryBatch(batch, entities, online, entityType, indexDocType, uriMap)
batch.clear()
}
}

if (batch) {
collectoryBatch(batch, entities, online, entityType, indexDocType, uriMap)
}

if (entities) {
indexService.indexBatch(entities, online)
}

log("Finished indexing ${drLists.size()} ${entityType}")
}
log "Finished collectory import"
}

/**
 * Resolve a batch of collectory resources to their full details and append a
 * SOLR document for each to the shared entities buffer, flushing the buffer to
 * the index whenever it exceeds BUFFER_SIZE.
 *
 * @param batch        resource summaries to resolve (passed to collectoryService.getBatch)
 * @param entities     shared buffer of SOLR docs; mutated and partially flushed here
 * @param online       true to index into the live core
 * @param entityType   collectory entity type for the batch lookup
 * @param indexDocType IndexDocType stamped on each document
 * @param uriMap       uid -> resource uri, used as the SOLR doc id
 */
def collectoryBatch(batch, entities, online, entityType, indexDocType, uriMap) {
    def drs = collectoryService.getBatch(batch, entityType)

    drs.each { details ->
        def doc = [:]
        doc["id"] = uriMap[details.uid]
        doc["datasetID"] = details.uid
        doc["guid"] = details.alaPublicUrl
        doc["idxtype"] = indexDocType.name()
        doc["name"] = details.name?.trim()
        doc["description"] = details.pubDescription?.trim()
        doc["distribution"] = "N/A"

        if (details.rights)
            doc["rights"] = details.rights
        if (details.licenseType)
            // Elvis must bind to licenseVersion only: the previous form
            // ((a + " " + b) ?: "") concatenated a null version into "CC-BY null".
            doc["license"] = (details.licenseType + " " + (details.licenseVersion ?: "")).trim()
        if (details.acronym)
            doc["acronym"] = details.acronym
        if (details.logoRef?.uri)
            doc["image"] = details.logoRef.uri

        entities << doc

        // flush in chunks so the buffer never grows unbounded
        if (entities.size() > BUFFER_SIZE) {
            indexService.indexBatch(entities, online)
            entities.clear()
        }
    }
}

/**
* Index WordPress pages
*/
Expand Down Expand Up @@ -674,6 +707,13 @@ class ImportService implements GrailsConfigurationAware {
doc["name"] = project.name
doc["content"] = project.description?:""
doc["linkIdentifier"] = project.url

doc["projectType_s"] = project.projectType
if (project.urlImage) doc["image"] = project.urlImage
doc["containsActivity_s"] = project.containsActivity
doc["dateCreated_s"] = project.dateCreated
if (project.keywords) doc["keywords_s"] = project.keywords

// add to doc to buffer (List)
buffer << doc
// update progress bar (number output only)
Expand All @@ -696,6 +736,74 @@ class ImportService implements GrailsConfigurationAware {
log "Finished biocollect import"
}

/**
 * Index species lists from the lists service as SPECIESLIST SOLR documents.
 * Builds a human-readable "content" field from i18n labels for the list type
 * and selected list attributes, then clears and rewrites the SPECIESLIST docs.
 *
 * @param online true to index into the live core (search for species lists may
 *               be briefly unavailable while the docs are deleted and rewritten)
 * @throws Exception if the lists service cannot be contacted
 */
def importSpeciesLists(boolean online) throws Exception {
    log "Starting species lists import."

    // get List of species lists
    def lists = listService.resources()
    def documentCount = 0
    def totalDocs = lists.size()
    def buffer = []
    log("Species lists found: ${totalDocs}") // update user via socket

    // slurp and build each SOLR doc (add to buffer)
    lists.each { list ->
        // public "show" page for the list doubles as guid and linkIdentifier
        def url = MessageFormat.format(grailsApplication.config.lists.service + grailsApplication.config.lists.show, list.dataResourceUid)
        log "indexing url: ${url}"
        try {
            documentCount++

            // create SOLR doc
            log.debug documentCount + ". Indexing Species lists - id: " + list.dataResourceUid + " | title: " + list.listName + "... "
            def doc = [:]
            doc["idxtype"] = IndexDocType.SPECIESLIST.name()
            doc["guid"] = url
            doc["id"] = list.dataResourceUid // guid required
            doc["name"] = list.listName
            doc["linkIdentifier"] = url

            doc["listType_s"] = list.listType
            def content = messageSource.getMessage('list.content.listType', null, LocaleContextHolder.locale) + ": " +
                    messageSource.getMessage("list." + list.listType, null, LocaleContextHolder.locale)

            ['dateCreated', 'itemCount', 'isAuthoritative', 'isInvasive', 'isThreatened', 'region'].each { item ->
                def label = messageSource.getMessage('list.content.' + item, null, LocaleContextHolder.locale)
                // attributes with an empty i18n label (e.g. isAuthoritative) are skipped
                if (label && list[item]) {
                    if ("true" == list[item].toString()) {
                        // boolean flags render as the bare label, e.g. ", invasive list"
                        content += ', ' + label
                    } else {
                        content += ', ' + label + ": " + list[item]
                    }
                    doc[item + "_s"] = list[item]
                }
            }

            doc["content"] = content
            // add to doc to buffer (List)
            buffer << doc
            // update progress bar (number output only)
            if (documentCount > 0) {
                updateProgressBar(totalDocs, documentCount)
            }
        } catch (IOException ex) {
            // catch it so we don't stop indexing other lists.
            // Fix: this message previously interpolated the undefined variable
            // 'project', which threw MissingPropertyException instead of logging.
            log("Problem accessing/reading Species lists <${url}>: " + ex.getMessage() + " - document skipped")
            log.warn(ex.getMessage(), ex)
        }
    }
    log("Committing ${buffer.size()} documents to SOLR...")
    if (online) {
        log "Search for species lists may be temporarily unavailable"
    }
    indexService.deleteFromIndex(IndexDocType.SPECIESLIST, online)
    indexService.indexBatch(buffer, online)
    updateProgressBar(100, 100) // complete progress bar
    log "Finished species lists import"
}

/**
* Index Knowledge Base pages.
*/
Expand Down
Loading

0 comments on commit 62295d4

Please sign in to comment.