Skip to content

Commit

Permalink
New configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
diogok committed Dec 8, 2016
1 parent 67b1d86 commit db704b1
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 24 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ FROM diogok/java8:zulu
WORKDIR /opt
CMD ["java","-server","-XX:+UseConcMarkSweepGC","-XX:+UseCompressedOops","-XX:+DoEscapeAnalysis","-jar","dwc-bot-es.jar"]

ADD target/dwc-bot-es-0.0.4-standalone.jar /opt/dwc-bot-es.jar
ADD target/dwc-bot-es-0.0.5-standalone.jar /opt/dwc-bot-es.jar

24 changes: 19 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
# dwc-bot-es

A bot to read DarwinCore Archives from IPTs and index them on ElasticSearch, indexing Resources, Checklists and Occurrences.
A bot to read DarwinCore Archives (DwC-A) from IPTs and index them on ElasticSearch, indexing Resources, Checklists and Occurrences.

## Deploy

You might need to run this first before starting ElasticSearch:

$ sudo sysctl -w vm.max_map_count=262144

### Run with Docker

Run the docker container

$ docker run -d --volume /etc/biodiv:/etc/biodiv:ro diogok/dwc-bot-es

With docker-compose, including ElasticSearch and Kibana for exploration:
Or with docker-compose, including ElasticSearch and Kibana for exploration:

```yaml
version: "2"
Expand All @@ -35,15 +39,25 @@ services:
Download the latest jar from the [releases page](https://github.com/diogok/dwc-bot-es/releases) and run it:
$ java -server -jar dwc-bot-es.jar
$ java -jar dwc-bot-es.jar
### Configuration
It will look for a list of IPTs to crawl in /etc/biodiv/dwc-bot.list or at the directory defined by DWC\_BOT environment variable.
It will look for a configuration file at /etc/biodiv/config.ini, or at the path defined by the CONFIG environment variable.
The configuration file looks like the following:
ELASTICSEARCH=http://localhost:9200
INDEX=dwc
LOOP=false
ELASTICSEARCH tells which ElasticSearch server to connect to. INDEX tells which ElasticSearch index to use. LOOP defines whether the bot should run in a loop (true) or only once (false).
It will also look for a list of IPTs to crawl in /etc/biodiv/dwc-bot.list or at the file defined by DWC\_BOT environment variable.
You can set the ElasticSearch and Index to use with environment variables, such as:
$ DWC_BOT=/etc/biodiv/dwc-bot.list ELASTICSEARCH=http://localhost:9200 INDEX=dwc java -jar dwc-bot-es.jar
$ CONFIG=/etc/biodiv/config.ini DWC_BOT=/etc/biodiv/dwc-bot.list ELASTICSEARCH=http://localhost:9200 INDEX=dwc java -jar dwc-bot-es.jar
Or to run a single source (or any number of sources):
Expand Down
6 changes: 3 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ services:
volumes:
- /var/data/dwc-elasticsearch/data:/usr/share/elasticsearch/data:rw
kibana:
image: diogok/kibana
ports:
- 8001:8001
image: diogok/kibana
ports:
- 8001:8001
3 changes: 2 additions & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject dwc-bot-es "0.0.4"
(defproject dwc-bot-es "0.0.5"
:description "Insert DarwinCore Archives Occurrences from IPT into ElasticSearch"
:url "http://github.com/diogok/dwc-bot-es"
:license {:name "MIT"}
Expand All @@ -17,5 +17,6 @@

[com.taoensso/timbre "4.7.4"]
[environ "1.1.0"]]
:repositories [["clojars" {:sign-releases false}]]
:source-paths ["src"]
:profiles {:uberjar {:aot :all}})
3 changes: 3 additions & 0 deletions resources/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ELASTICSEARCH=http://localhost:9200
INDEX=dwc
LOOP=false
37 changes: 27 additions & 10 deletions src/dwc_bot_es/config.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,26 @@
(:require [environ.core :refer (env)])
(:require [taoensso.timbre :as log]))

(def es (or (env :elasticsearch) "http://localhost:9200"))
(def index (or (env :index) "dwc"))
(def should-loop (= "true" (or (env :loop) "false")))
(defn config-file
  "Picks the configuration source to read.

  Returns the file at the path given by `(env :config)`, defaulting to
  /etc/biodiv/config.ini, when that file exists. Otherwise returns the
  `config.ini` resource looked up through `io/resource`."
  []
  (let [external (io/file (or (env :config) "/etc/biodiv/config.ini"))]
    (if (.exists external)
      external
      (io/resource "config.ini"))))

(defn cfg
  "Loads the configuration as a map of lower-cased keyword -> string value.

  With no arguments, reads the source chosen by `config-file`, one KEY=VALUE
  entry per line, and returns the whole map. With a key `k`, returns only the
  value stored under that key (nil when absent). A value returned by `env`
  for the same keyword takes precedence over the one written in the file.

  The source is re-read on every call."
  ([]
   (with-open [rdr (io/reader (config-file))]
     (->> (line-seq rdr)
          (map (fn [^String line] (.trim line)))
          ;; Keep only KEY=VALUE entries: this drops blank lines and lines
          ;; without a key or without an \"=\" instead of turning them into
          ;; bogus keys that map to themselves.
          (filter (fn [^String line] (pos? (.indexOf line "="))))
          ;; Limit 2 splits on the first \"=\" only, so values that contain
          ;; \"=\" (e.g. a URL with a query string) stay intact and an empty
          ;; value (\"KEY=\") yields \"\" rather than the key itself.
          (map (fn [^String line] (.split line "=" 2)))
          (map (fn [[^String k ^String v]]
                 [(keyword (.toLowerCase (.trim k))) (.trim v)]))
          ;; reduce is eager, so every line is consumed before the reader
          ;; is closed by with-open.
          (reduce (fn [acc [k v]] (assoc acc k (or (env k) v))) {}))))
  ([k] ((cfg) k)))


(defn load-base-inputs-0
"Load a config file list into a list"
Expand Down Expand Up @@ -37,16 +54,16 @@
(let [done (atom false)]
(while (not @done)
(try
(log/info (str "Waiting: " es))
(let [r (http/get es {:throw-exceptions false})]
(log/info (str "Waiting: " (cfg :elasticsearch)))
(let [r (http/get (cfg :elasticsearch) {:throw-exceptions false})]
(if (= 200 (:status r))
(reset! done true)
(Thread/sleep 1000)))
(catch Exception e
(do
(log/warn (.toString e))
(Thread/sleep 1000)))))
(log/info (str "Done: " es))))
(log/info (str "Done: " (cfg :elasticsearch)))))

(defn setup
([] (setup "occurrence")
Expand All @@ -55,16 +72,16 @@
(wait-es)
(let [mapping (slurp (io/resource (str row-type "_mapping.json" )))]
(try
(let [r-idx (http/get (str es "/" index) {:throw-exceptions false})
r-typ (http/get (str es "/" index "/_mapping/" row-type) {:throw-exceptions false})]
(let [r-idx (http/get (str (cfg :elasticsearch) "/" (cfg :index)) {:throw-exceptions false})
r-typ (http/get (str (cfg :elasticsearch) "/" (cfg :index) "/_mapping/" row-type) {:throw-exceptions false})]
(if (= 404 (:status r-idx))
(log/info
(:body
(http/put (str es "/" index) {:throw-exceptions false}))))
(http/put (str (cfg :elasticsearch) "/" (cfg :index)) {:throw-exceptions false}))))
(if (or (= 404 (:status r-typ)) (= "{}" (:body r-typ)))
(log/info
(:body
(http/put (str es "/" index "/_mapping/" row-type)
(http/put (str (cfg :elasticsearch) "/" (cfg :index) "/_mapping/" row-type)
{:body mapping
:throw-exceptions false
:headers {"Content-Type" "application/json"}})))))
Expand Down
8 changes: 4 additions & 4 deletions src/dwc_bot_es/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
[src row-type row]
"Prepare to send to elasticsearch"
(let [doc (metadata src row)]
[{:index {:_index config/index :_type (name row-type) :_id (:id doc)}}
[{:index {:_index (config/cfg :index) :_type (name row-type) :_id (:id doc)}}
doc]))

(defn fix
Expand All @@ -98,7 +98,7 @@
(when (> (count rows) 0)
(let [body (make-body src row-type rows)]
(try
(http/post (str config/es "/" config/index "/_bulk") {:body body})
(http/post (str (config/cfg :elasticsearch) "/" (config/cfg :index) "/_bulk") {:body body})
(log/info "Saved" (count rows) " " (name row-type) " from " src)
(catch Exception e
(do (log/warn "Error saving" (.getMessage e))
Expand All @@ -110,7 +110,7 @@
[row-type rec]
(let [source (:link rec)]
(log/info "->" source)
(http/post (str config/es "/" config/index "/resource/" (source-id (:link rec)))
(http/post (str (config/cfg :elasticsearch) "/" (config/cfg :index) "/resource/" (source-id (:link rec)))
{:body (json/write-str (assoc rec :resource (source-name (:link rec)) :id (source-id (:link rec))))})
(let [waiter (chan 1)
batch (batcher {:size (* 1 1024)
Expand All @@ -131,7 +131,7 @@
(let [looping (atom true)]
(while @looping
(do
(swap! looping (fn [_] config/should-loop))
(swap! looping (fn [_] (= "true" (config/cfg :loop))))
(log/info "Bot Active")
(let [recs (apply all-resources args)]
(log/info "Got" (count recs) "resources")
Expand Down

0 comments on commit db704b1

Please sign in to comment.