diff --git a/docs/conf.py b/docs/conf.py index 5147a87f..e6fb75b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,7 +57,7 @@ # General information about the project. project = u'Invenio-Stats' -copyright = u'2017, CERN' +copyright = u'2020, CERN' author = u'CERN' # The version info for the project you're documenting, acts as replacement for diff --git a/docs/index.rst b/docs/index.rst index 0e865347..8ff24a2f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ Invenio-Stats. overview configuration usage + operations examplesapp diff --git a/docs/operations.rst b/docs/operations.rst new file mode 100644 index 00000000..d8217c05 --- /dev/null +++ b/docs/operations.rst @@ -0,0 +1,121 @@ +.. + This file is part of Invenio. + Copyright (C) 2016-2020 CERN. + + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. + +Operations +========== + +Since our only copy of stats is stored in the indices of Elasticsearch, in case +of a cluster error or failure we will lose our stats data. Thus it is advised +to set up a backup/restore mechanism for projects in production. + +We have several options when it comes down to tooling and methods for preserving +our data in Elasticsearch. + +- `elasticdump `_ + A simple and straightforward tool for moving and saving indices. +- `Elasticsearch Snapshots `_ + is a tool that takes snapshots of our cluster. Snapshots are built in an incremental + fashion so current snapshots do not include data from previous ones. + We can also take snapshots of individual indices or the whole cluster. +- `Curator `_ + is an advanced python library from elastic, you can read more about + curator and how to configure and use it, in the official `Elasticsearch + documentation `_ +- Not recommended, but if you want, you can even keep raw filesystem backups for + each of your elasticsearch nodes. 
+

Demonstrating all the aforementioned tools falls outside the scope of this
guide so we will provide examples only for elasticdump.

.. note::
    To give you an idea of the magnitude of the stats data produced, `Zenodo `_
    for January 2020, got approximately **3M** visits (combined harvesters and users),
    which produced approximately **10GB** of stats data.


Backup with elasticdump
~~~~~~~~~~~~~~~~~~~~~~~

.. note::
    Apart from the data, you will also have to back up the mappings, so you are
    able to restore data properly. The following example will back up only stats
    for record-views (not the events), you can go through your indices and
    select which ones make sense to back up.


Save our mappings and our index data to record_view_mapping_backup.json and
record_view_index_backup.json files respectively.

.. code-block:: console

    $ elasticdump \
    > --input=http://localhost:9200/stats-record-view-2020-03 \
    > --output=record_view_mapping_backup.json \
    > --type=mapping

    Fri, 13 Mar 2020 13:13:01 GMT | starting dump
    Fri, 13 Mar 2020 13:13:01 GMT | got 1 objects from source elasticsearch (offset: 0)
    Fri, 13 Mar 2020 13:13:01 GMT | sent 1 objects to destination file, wrote 1
    Fri, 13 Mar 2020 13:13:01 GMT | got 0 objects from source elasticsearch (offset: 1)
    Fri, 13 Mar 2020 13:13:01 GMT | Total Writes: 1
    Fri, 13 Mar 2020 13:13:01 GMT | dump complete

    $ elasticdump \
    > --input=http://localhost:9200/stats-record-view-2020-03 \
    > --output=record_view_index_backup.json \
    > --type=data

    Fri, 13 Mar 2020 13:13:13 GMT | starting dump
    Fri, 13 Mar 2020 13:13:13 GMT | got 5 objects from source elasticsearch (offset: 0)
    Fri, 13 Mar 2020 13:13:13 GMT | sent 5 objects to destination file, wrote 5
    Fri, 13 Mar 2020 13:13:13 GMT | got 0 objects from source elasticsearch (offset: 5)
    Fri, 13 Mar 2020 13:13:13 GMT | Total Writes: 5
    Fri, 13 Mar 2020 13:13:13 GMT | dump complete

In order to test the restore functionality below I will 
delete on purpose the
index we backed up, from my instance.

.. code-block:: console

    $ curl -XDELETE http://localhost:9200/stats-record-view-2020-03
    {"acknowledged":true}


Restore with elasticdump
~~~~~~~~~~~~~~~~~~~~~~~~

As we are all aware, a backup has not worked until it has been restored. Note that
before importing our data, we need to import the mappings to re-create the index.
The process is identical to the backup, just with the sources --input and
--output reversed.


.. code-block:: console

    $ elasticdump \
    > --input=record_view_mapping_backup.json \
    > --output=http://localhost:9200/stats-record-view-2020-03 \
    > --type=mapping

    Fri, 13 Mar 2020 15:22:17 GMT | starting dump
    Fri, 13 Mar 2020 15:22:17 GMT | got 1 objects from source file (offset: 0)
    Fri, 13 Mar 2020 15:22:17 GMT | sent 1 objects to destination elasticsearch, wrote 4
    Fri, 13 Mar 2020 15:22:17 GMT | got 0 objects from source file (offset: 1)
    Fri, 13 Mar 2020 15:22:17 GMT | Total Writes: 4
    Fri, 13 Mar 2020 15:22:17 GMT | dump complete

    $ elasticdump \
    > --input=record_view_index_backup.json \
    > --output=http://localhost:9200/stats-record-view-2020-03 \
    > --type=data

    Fri, 13 Mar 2020 15:23:01 GMT | starting dump
    Fri, 13 Mar 2020 15:23:01 GMT | got 5 objects from source file (offset: 0)
    Fri, 13 Mar 2020 15:23:01 GMT | sent 5 objects to destination elasticsearch, wrote 5
    Fri, 13 Mar 2020 15:23:01 GMT | got 0 objects from source file (offset: 5)
    Fri, 13 Mar 2020 15:23:01 GMT | Total Writes: 5
    Fri, 13 Mar 2020 15:23:01 GMT | dump complete diff --git a/examples/app.py b/examples/app.py index 4071b3c6..2598780c 100644 --- a/examples/app.py +++ b/examples/app.py @@ -127,6 +127,7 @@ def fixtures(): def publish_filedownload(nb_events, user_id, file_key, file_id, bucket_id, date): + """Publish file download event.""" current_stats.publish('file-download', [dict( # When: timestamp=( @@ -143,7 +144,7 @@ def publish_filedownload(nb_events, 
user_id, file_key, @fixtures.command() def events(): - # Create events + """Create events.""" nb_days = 20 day = datetime(2016, 12, 1, 0, 0, 0) max_events = 10 @@ -162,6 +163,7 @@ def events(): @fixtures.command() def aggregations(): + """Aggregate events.""" aggregate_events(['file-download-agg']) # flush elasticsearch indices so that the aggregations become searchable current_search_client.indices.flush(index='*') diff --git a/invenio_stats/__init__.py b/invenio_stats/__init__.py index 04f98985..4b5fe657 100644 --- a/invenio_stats/__init__.py +++ b/invenio_stats/__init__.py @@ -223,14 +223,14 @@ def register_events(): delete or archive old indices. 2. Aggregating -^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~ The :py:class:`~invenio_stats.processors.EventsIndexer` processor indexes raw events. Querying those events can put a big strain on the Elasticsearch cluster. Thus Invenio-Stats provides a way to *compress* those events by pre-aggregating them into meaningful statistics. -*Example: individual file downoalds events can be aggregated into the number of +*Example: individual file downloads events can be aggregated into the number of file download per day and per file.* Aggregations are registered in the same way as events, under the entrypoint @@ -270,7 +270,7 @@ def register_aggregations(): ] An aggregator class must be specified. The dictionary ``params`` -contains all the arguments given to its construtor. An Aggregator class is +contains all the arguments given to its constructor. An Aggregator class is just required to have a ``run()`` method. The default one is :py:class:`~invenio_stats.aggregations.StatAggregator` @@ -300,7 +300,7 @@ def register_aggregations(): ] } -Again the registering function returns the configuraton for the query: +Again the registering function returns the configuration for the query: .. 
code-block:: python diff --git a/requirements-devel.txt b/requirements-devel.txt index dd330062..fbe1005c 100644 --- a/requirements-devel.txt +++ b/requirements-devel.txt @@ -14,4 +14,3 @@ -e git+https://github.com/inveniosoftware/invenio-queues.git#egg=invenio-queues -e git+https://github.com/inveniosoftware/invenio-search.git#egg=invenio-search --e git+https://github.com/inveniosoftware/invenio-base.git#egg=invenio-base diff --git a/setup.py b/setup.py index fd4e80b9..a5eb66b2 100644 --- a/setup.py +++ b/setup.py @@ -69,13 +69,14 @@ install_requires = [ 'counter-robots>=2018.6', - 'invenio-base>=1.2.2', + 'Flask>=0.11.1', 'invenio-cache>=1.0.0', 'invenio-celery>=1.1.3', 'invenio-queues>=1.0.0a2', 'maxminddb-geolite2>=2017.0404', 'python-dateutil>=2.6.1', 'python-geoip>=1.2', + 'Werkzeug>=0.15.0, <1.0.0', ] packages = find_packages() diff --git a/tests/conftest.py b/tests/conftest.py index 01b54b22..6aaf17aa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ import uuid from contextlib import contextmanager from copy import deepcopy +from unittest.mock import Mock, patch # imported to make sure that # login_oauth2_user(valid, oauth) is included @@ -42,7 +43,6 @@ from invenio_records.api import Record from invenio_search import InvenioSearch, current_search, current_search_client from kombu import Exchange -from unittest.mock import Mock, patch from six import BytesIO from sqlalchemy_utils.functions import create_database, database_exists diff --git a/tests/contrib/test_event_builders.py b/tests/contrib/test_event_builders.py index 0df96d17..e58e2e31 100644 --- a/tests/contrib/test_event_builders.py +++ b/tests/contrib/test_event_builders.py @@ -9,7 +9,6 @@ """Test event builders.""" import datetime - from unittest.mock import patch from invenio_stats.contrib.event_builders import file_download_event_builder, \ diff --git a/tests/test_aggregations.py b/tests/test_aggregations.py index 85a4c1b3..d7237ffe 100644 --- a/tests/test_aggregations.py 
+++ b/tests/test_aggregations.py @@ -9,12 +9,12 @@ """Aggregation tests.""" import datetime +from unittest.mock import patch import pytest from conftest import _create_file_download_event from elasticsearch_dsl import Index, Search from invenio_search import current_search -from unittest.mock import patch from invenio_stats import current_stats from invenio_stats.aggregations import StatAggregator, filter_robots diff --git a/tests/test_processors.py b/tests/test_processors.py index a8b6876f..7bbcacde 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -10,6 +10,7 @@ import logging from datetime import datetime +from unittest.mock import patch import pytest from conftest import _create_file_download_event @@ -18,7 +19,6 @@ from helpers import get_queue_size from invenio_queues.proxies import current_queues from invenio_search import current_search -from unittest.mock import patch from invenio_stats.contrib.event_builders import build_file_unique_id, \ file_download_event_builder