Update and improve crawler configuration
sebastian-nagel committed May 22, 2019
1 parent cb46d62 commit fc91ef7
Showing 3 changed files with 11 additions and 11 deletions.
9 changes: 6 additions & 3 deletions bin/es_status
@@ -83,9 +83,9 @@ function ____show_help() {
echo " bulk setting of next fetch time (or count/search to inspect impact before)"
echo
echo " index_size"
- echo " current size (number of documents) of the status index"
+ echo " show current size (number of documents) of the status index"
echo " index_size_gb"
- echo " current size (GB) of the status index"
+ echo " show current size (storage in GiB) of the status index"
echo
echo "Environment variables"
echo " ES_STATUS_URL URL to query status index, default: ${__ES_STATUS_URL_DEFAULT}"
@@ -606,17 +606,20 @@ function __set_metadata_url () {
}'
}

+ # get index statistics
function __index_stats() {
- $SHOW_COMMAND
FILTER="${1:-.}"
+ $SHOW_COMMAND
$CURL -XGET $ES_STATUS_URL'/_stats' \
| jq "$FILTER"
}

+ # show index size (number of documents)
function __index_size() {
__index_stats ._all.primaries.docs.count
}

+ # show index size (storage in GiB)
function __index_size_gb() {
__index_stats "._all.primaries.store.size_in_bytes | (. / (1024*1024*1024))"
}
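The jq filter used by `__index_size_gb` divides the primary store size in bytes by 1024³ to get GiB. Assuming jq is available, the conversion can be sanity-checked in isolation against a minimal, hypothetical `_stats` payload (real Elasticsearch responses carry many more fields; the byte count here is made up for illustration):

```shell
#!/bin/sh
# Hypothetical excerpt of an Elasticsearch _stats response.
# 3221225472 bytes is exactly 3 GiB (3 * 1024 * 1024 * 1024).
STATS='{"_all":{"primaries":{"store":{"size_in_bytes":3221225472}}}}'

# Same filter string that __index_size_gb passes to __index_stats.
printf '%s' "$STATS" \
    | jq '._all.primaries.store.size_in_bytes | (. / (1024*1024*1024))'
# prints: 3
```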
7 changes: 2 additions & 5 deletions conf/crawler-conf.yaml
@@ -61,10 +61,6 @@ config:
# do not fail on unknown SSL certificates
http.trust.everything: true

- # use okhttp
- http.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol
- https.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol

# delay between successive requests to the same host
# (be defensive, a delay of 5 sec. means about 1000 fetches per hour
# which should be enough even for large news sites)
@@ -79,7 +75,7 @@ config:
# limit the number of queued URLs
# - avoid duplicate fetches (queues are not sets)
fetcher.max.queue.size: 10
- fetcher.max.urls.in.queues: 3000
+ fetcher.max.urls.in.queues: 6000

# fetch Scheduler implementation
scheduler.class: "com.digitalpebble.stormcrawler.persistence.AdaptiveScheduler"
@@ -118,6 +114,7 @@ config:
fetchInterval.FETCH_ERROR.isFeed=true: 4320
fetchInterval.FETCH_ERROR.isSitemapNews=true: 4320
fetchInterval.FETCH_ERROR.isSitemapIndex=true: 4320

# try to re-fetch feeds or sitemaps with status 404 or equiv. after 2 months
# (status is neither FETCHED, REDIRECTION, or DISCOVERED)
fetchInterval.ERROR.isFeed=true: 86400
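The per-host politeness delay in this file bounds crawl throughput: at one request every 5 seconds, a single host yields 3600 / 5 = 720 fetches per hour, the order of magnitude ("about 1000 fetches per hour") the config comment refers to. A quick back-of-the-envelope sketch (variable names are illustrative, not actual config keys):

```shell
#!/bin/sh
# Maximum fetches per hour from one host at a fixed per-host delay.
# DELAY_SEC mirrors the 5 sec. delay discussed in crawler-conf.yaml.
DELAY_SEC=5
SECONDS_PER_HOUR=3600

echo $((SECONDS_PER_HOUR / DELAY_SEC))
# prints: 720
```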
6 changes: 3 additions & 3 deletions conf/es-conf.yaml
@@ -30,7 +30,7 @@ config:
# positive or negative filter parsable by the Lucene Query Parser
# es.status.filterQuery: "-(metadata.hostname:stormcrawler.net)"

- # time in secs for which the URLs will be considered for fetching after a ack of fail
+ # time in secs for which the URLs will be considered for fetching after a ack or fail
# need a high value to avoid duplicates by URLs added multiple times to the fetcher
# queues, should be close to
# fetcher.max.crawl.delay * fetcher.max.queue.size
@@ -44,7 +44,7 @@ config:
# might be returned.
# - should reset to avoid that a bucket with many URLs blocks incrementing the date to look
# for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs.
- spout.reset.fetchdate.after: 1200
+ spout.reset.fetchdate.after: 240

es.status.max.buckets: 200
# max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6
@@ -58,7 +58,7 @@ config:
# field to sort the buckets
es.status.global.sort.field: "nextFetchDate"

- # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query
+ # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query
es.status.max.start.offset: 500

# AggregationSpout : sampling improves the performance on large crawls
Expand Down
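The bucket sizing comment in es-conf.yaml ("30 sec. / 5 sec. fetch delay = 6") together with `es.status.max.buckets: 200` bounds how many URLs one spout query cycle can emit. A rough sketch of that arithmetic, assuming the 30-second window and 5-second per-host delay from the config comments:

```shell
#!/bin/sh
# Rough sizing of one spout query cycle, using the numbers from es-conf.yaml.
WINDOW_SEC=30      # time window a query cycle should cover (per the comment)
DELAY_SEC=5        # per-host fetch delay from crawler-conf.yaml
MAX_BUCKETS=200    # es.status.max.buckets

URLS_PER_BUCKET=$((WINDOW_SEC / DELAY_SEC))   # 6, as in the config comment
echo $((MAX_BUCKETS * URLS_PER_BUCKET))
# prints: 1200 (upper bound on URLs emitted per query cycle)
```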
