From d95e03f0899628449771ed6d830837382c834ef3 Mon Sep 17 00:00:00 2001
From: Peter Bengtsson
Date: Wed, 14 Apr 2021 18:28:31 -0400
Subject: [PATCH] Use Elasticsearch index aliases to avoid search downtime
 (#3409)

* Use ElasticSearch index aliases to avoid search downtime

Fixes #3098

* Use ElasticSearch index aliases to avoid search downtime

Part of #3098 (the reason this PR doesn't close issue #3098 is because
the `prod-build.yml` still uses `--update`)

* flake8

* Apply suggestions from code review

Co-authored-by: Ryan Johnson

* remove 'already'

* feedbacked

* cleaning up

* cope with --update the first time ever

* Apply suggestions from code review

Co-authored-by: Ryan Johnson

* feedbacked

Co-authored-by: Ryan Johnson
---
 .github/workflows/prod-build.yml          |  11 +-
 .github/workflows/stage-build.yml         |  11 +-
 deployer/README.md                        |  44 ++++---
 deployer/src/deployer/main.py             |   7 --
 deployer/src/deployer/search/__init__.py  | 147 +++++++++++++++--------
 deployer/src/deployer/search/models.py    |  12 +-
 6 files changed, 136 insertions(+), 96 deletions(-)

diff --git a/.github/workflows/prod-build.yml b/.github/workflows/prod-build.yml
index ee42d1fc7ad2..c7d3268ddcbf 100644
--- a/.github/workflows/prod-build.yml
+++ b/.github/workflows/prod-build.yml
@@ -231,16 +231,7 @@ jobs:
           # poetry run deployer upload --prune --archived-files ../content/archived.txt ../client/build
           poetry run deployer upload ../client/build
           poetry run deployer update-lambda-functions ./aws-lambda
-
-          # TODO: Depending on how long the upload takes, consider switching to
-          # use `--update` in this command. It will skip the index re-creation
-          # and "cake on". This is slightly faster and means no window of time
-          # where the Elasticsearch server is nearly empty.
-          # Consider doing something like a clean indexing on Sundays and
-          # use `--update` on all the other week days.
-          # poetry run deployer search-index ../client/build --priority-prefix=en-us/docs/web
-          # TEMPORARY
-          poetry run deployer search-index ../client/build --update
+          poetry run deployer search-index ../client/build

           # Record the deployment in our Speedcurve account. This should always
           # be done after the upload of the documents and Lambda functions, as well
diff --git a/.github/workflows/stage-build.yml b/.github/workflows/stage-build.yml
index dcb1f416bb3c..f2d22cd80015 100644
--- a/.github/workflows/stage-build.yml
+++ b/.github/workflows/stage-build.yml
@@ -237,16 +237,7 @@ jobs:
           # poetry run deployer upload --prune --archived-files ../content/archived.txt ../client/build
           poetry run deployer upload ../client/build
           poetry run deployer update-lambda-functions ./aws-lambda
-
-          # TODO: Depending on how long the upload takes, consider switching to
-          # use `--update` in this command. It will skip the index re-creation
-          # and "cake on". This is slightly faster and means no window of time
-          # where the Elasticsearch server is nearly empty.
-          # Consider doing something like a clean indexing on Sundays and
-          # use `--update` on all the other week days.
-          # poetry run deployer search-index ../client/build --priority-prefix=en-us/docs/web
-          # TEMPORARY
-          poetry run deployer search-index ../client/build --update
+          poetry run deployer search-index ../client/build

           # Record the deployment in our Speedcurve account. This should always
           # be done after the upload of the documents and Lambda functions, as well
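Both workflows now run `search-index` without `--update`, so every deploy builds a fresh, date-suffixed index and then swaps the `mdn_docs` alias over to it. A minimal sketch of how such a name is derived, mirroring the `Index.name` change in `models.py` further down (the helper function is illustrative, not part of the patch):

```python
import datetime

INDEX_ALIAS_NAME = "mdn_docs"


def build_index_name(now=None):
    # Every (re)index run gets its own physical index,
    # e.g. mdn_docs_20210414182831, behind the stable alias.
    now = now or datetime.datetime.utcnow()
    return f'{INDEX_ALIAS_NAME}_{now.strftime("%Y%m%d%H%M%S")}'


print(build_index_name())
```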
diff --git a/deployer/README.md b/deployer/README.md
index 7cedfde95fe2..9fd00e7ab7de 100644
--- a/deployer/README.md
+++ b/deployer/README.md
@@ -131,6 +131,30 @@
 This is to make it convenient in GitHub Actions to control the
 execution purely based on the presence of the environment variable.

+### About Elasticsearch aliases
+
+The default behavior is that each day you get a different index name.
+E.g. `mdn_docs_20210331093714`. And then there's an alias with a more
+"generic" name. E.g. `mdn_docs`. The alias name is what Kuma sends its
+search queries to.
+
+The way indexing works is that we leave the existing index and its alias
+in place, then we fill up a new index and once that works, we atomically
+"move the alias" and delete the old index. To demonstrate, consider this
+example timeline:
+
+- Yesterday: index `mdn_docs_20210330093714` and alias
+  `mdn_docs --> mdn_docs_20210330093714`
+- Today:
+  - create new index `mdn_docs_20210331094500`
+  - populate `mdn_docs_20210331094500` (could take a long time)
+  - atomically re-assign the alias `mdn_docs --> mdn_docs_20210331094500`
+    and delete the old index `mdn_docs_20210330093714`
+
+Note, this only applies if you _don't_ use `--update`.
+If you use `--update` it will just keep adding to the existing index whose
+name is based on today's date.
+
+What this means is that **there is zero downtime for the search queries**.
+Nothing needs to be reconfigured on the Kuma side.
+
 ### To update or not to start afresh

 The default behavior is that it deletes the index first and immediately creates
@@ -148,26 +172,6 @@
 only from time to time omit it for a fresh new start.

 But note, if you omit the `--update` (i.e. recreating the index), search
 will work. It just may find less than it finds when it's fully indexed.

-### Priority prefixes
-
-When you index without `--update` it will delete and recreate the index.
-That means that during the time you're indexing, the searches that are happening
-concurrently will not find much. That's probably OK in most cases but you
-can adjust the priority of what gets indexed first. This has the advantage
-that most searches, that are expecting to find content in the popular document,
-will get something useful while the indexing is going on.
-
-To set up one or multiple priority prefixes use the `--priority-prefixes`
-(or just `-p` for short). Best described with an example:
-
-```sh
-poetry run deployer search-index ../client/build -p en-us/docs/web -p en-us/docs
-```
-
-This will first index all the files whose `index.json` file path (relative
-to the root) matches `en-us/docs/web`. Then it does all that match `en-us/docs`.
-And lastly, it does all the files that don't match any of the prefixes.
-
 ## Analyze PR builds

 When you've built files you can analyze those built files to produce a Markdown
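The timeline in the README addition above is easy to verify against a running cluster. A small sketch (the local cluster URL is an assumption, not part of this patch) that prints which physical index the alias currently points at:

```python
from elasticsearch import Elasticsearch

# Assumed local cluster URL; in the deployer this comes from configuration.
es = Elasticsearch("http://localhost:9200")

# get_alias(name=...) maps each physical index to its aliases, e.g.
# {"mdn_docs_20210331094500": {"aliases": {"mdn_docs": {}}}}
for index_name, info in es.indices.get_alias(name="mdn_docs").items():
    print(index_name, "->", sorted(info["aliases"]))
```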
diff --git a/deployer/src/deployer/main.py b/deployer/src/deployer/main.py
index 04f3052c4913..4659859641ed 100644
--- a/deployer/src/deployer/main.py
+++ b/deployer/src/deployer/main.py
@@ -337,12 +337,6 @@ def speedcurve_deploy(ctx, **kwargs):
     default=CI,
     show_default=True,
 )
-@click.option(
-    "--priority-prefix",
-    "-p",
-    multiple=True,
-    help="Specific folder prefixes to index first.",
-)
 @click.argument("buildroot", type=click.Path(), callback=validate_directory)
 @click.pass_context
 def search_index(ctx, buildroot: Path, **kwargs):
@@ -360,7 +354,6 @@ def search_index(ctx, buildroot: Path, **kwargs):
         url,
         update=kwargs["update"],
         no_progressbar=kwargs["no_progressbar"],
-        priority_prefixes=kwargs["priority_prefix"],
     )
diff --git a/deployer/src/deployer/search/__init__.py b/deployer/src/deployer/search/__init__.py
index b25047d68a1f..4c6be2e0e28a 100644
--- a/deployer/src/deployer/search/__init__.py
+++ b/deployer/src/deployer/search/__init__.py
@@ -2,14 +2,19 @@
 import re
 import time
 from pathlib import Path
-from collections import defaultdict, Counter
+from collections import Counter

 import click
 from elasticsearch.helpers import streaming_bulk
+from elasticsearch_dsl import Index
 from elasticsearch_dsl.connections import connections
 from selectolax.parser import HTMLParser

-from .models import Document
+from .models import Document, INDEX_ALIAS_NAME
+
+
+class IndexAliasError(Exception):
+    """When there's something wrong with finding the index alias."""


 def index(
@@ -17,7 +22,6 @@ def index(
     url: str,
     update=False,
     no_progressbar=False,
-    priority_prefixes: (str) = (),
 ):
     # We can confidently use a single host here because we're not searching
     # a cluster.
@@ -34,48 +38,51 @@ def index(

     click.echo(f"Found {count_todo:,} (potential) documents to index")

-    # Confusingly, `._index` is actually not a private API.
-    # It's the documented way you're supposed to reach it.
-    document_index = Document._index
-    if not update:
+    if update:
+        index_name = None
+        has_old_index = False
+        for name in connection.indices.get_alias():
+            if name.startswith("mdn_docs_"):
+                index_name = name
+                break
+            elif name == INDEX_ALIAS_NAME:
+                has_old_index = True
+        else:
+            if not has_old_index:
+                raise IndexAliasError("Unable to find an index called mdn_docs_*")
+
+        document_index = Index(index_name)
+    else:
+        # Confusingly, `._index` is actually not a private API.
+        # It's the documented way you're supposed to reach it.
+        document_index = Document._index
         click.echo(
             "Deleting any possible existing index "
-            f"and creating a new one called {document_index._name}"
+            f"and creating a new one called {document_index._name!r}"
         )
         document_index.delete(ignore=404)
         document_index.create()

-    search_prefixes = [None]
-    for prefix in reversed(priority_prefixes):
-        search_prefixes.insert(0, prefix)
-
-    count_by_prefix = defaultdict(int)
-
-    already = set()
-
     skipped = []

     def generator():
-        for prefix in search_prefixes:
-            root = Path(buildroot)
-            if prefix:
-                root /= prefix
-            for doc in walk(root):
-                if doc in already:
-                    continue
-                already.add(doc)
-                search_doc = to_search(doc)
-                if search_doc:
-                    count_by_prefix[prefix] += 1
-                    yield search_doc.to_dict(True)
-                else:
-                    # The reason something might be chosen to be skipped is because
-                    # there's logic that kicks in only when the `index.json` file
-                    # has been opened and parsed.
-                    # Keep a count of all of these. It's used to make sure the
-                    # progressbar, if used, ticks as many times as the estimate
-                    # count was.
-                    skipped.append(1)
+        root = Path(buildroot)
+        for doc in walk(root):
+            # The reason for specifying the exact index name is that we might
+            # be doing an update and if you don't specify it, elasticsearch_dsl
+            # will fall back to using whatever Document._meta.Index automatically
+            # becomes in this moment.
+            search_doc = to_search(doc, _index=document_index._name)
+            if search_doc:
+                yield search_doc.to_dict(True)
+            else:
+                # The reason something might be chosen to be skipped is because
+                # there's logic that kicks in only when the `index.json` file
+                # has been opened and parsed.
+                # Keep a count of all of these. It's used to make sure the
+                # progressbar, if used, ticks as many times as the estimate
+                # count was.
+                skipped.append(1)

     def get_progressbar():
         if no_progressbar:
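Because `to_search()` now stamps each document with the exact `_index` name, the `streaming_bulk()` call in the next hunk no longer needs an `index=` argument; each bulk action names its own target. A runnable sketch of that pattern (the cluster URL, index name, and document contents are made up for illustration):

```python
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch("http://localhost:9200")  # assumed local cluster


def generate_docs(index_name):
    for i in range(3):
        # Each action carries its own "_index", so the helper needs no
        # index= argument and updates can target the date-suffixed index.
        yield {
            "_index": index_name,
            "_id": f"/en-US/docs/Example/{i}",
            "title": f"Example document {i}",
        }


for ok, info in streaming_bulk(es, generate_docs("mdn_docs_20210414182831")):
    if not ok:
        print("Failed:", info)
```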
@@ -90,7 +97,6 @@ def get_progressbar():
         for success, info in streaming_bulk(
             connection,
             generator(),
-            index=document_index._name,
             # It's an inexact science as to what the perfect chunk_size should be.
             # The default according to
             # https://elasticsearch-py.readthedocs.io/en/v7.12.0/helpers.html#elasticsearch.helpers.streaming_bulk
@@ -122,6 +128,56 @@ def get_progressbar():
             for skip in skipped:
                 bar.update(1)

+    # Now when the index has been filled, we need to make sure we
+    # correct any previous indexes.
+    if update:
+        # When you do an update, Elasticsearch will internally delete the
+        # previous docs (based on the _id primary key we set).
+        # Normally, Elasticsearch will do this when you restart the cluster
+        # but that's not something we usually do.
+        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-forcemerge.html
+        if not has_old_index:
+            document_index.forcemerge()
+    else:
+        # (Apr 2021) In the olden days, before using aliases, we used to call
+        # the index what is today the alias. We have to delete that index
+        # to make way for using that name as the alias.
+        # This code can be deleted any time after it's been useful once
+        # in all deployments.
+        legacy_index = Index(INDEX_ALIAS_NAME)
+        if legacy_index.exists():
+            # But `.exists()` will be true if it's an alias as well! It basically
+            # answers: "Yes, it's an index or an alias".
+            # We only want to delete it if it's an index. The test for that
+            # is to see if it does *not* have an alias.
+            if (
+                INDEX_ALIAS_NAME in legacy_index.get_alias()
+                and not legacy_index.get_alias()[INDEX_ALIAS_NAME]["aliases"]
+            ):
+                click.echo(
+                    f"Delete the old {INDEX_ALIAS_NAME!r} index from when it was "
+                    "an actual index."
+                )
+                legacy_index.delete(ignore=404)
+
+        # Now we're going to bundle the change to set the alias to point
+        # to the new index and delete all old indexes.
+        # The reason for doing this together in one update is to make it atomic.
+        alias_updates = [
+            {"add": {"index": document_index._name, "alias": INDEX_ALIAS_NAME}}
+        ]
+        for index_name in connection.indices.get_alias():
+            if index_name.startswith("mdn_docs_"):
+                if index_name != document_index._name:
+                    alias_updates.append({"remove_index": {"index": index_name}})
+                    click.echo(f"Delete old index {index_name!r}")
+
+        connection.indices.update_aliases({"actions": alias_updates})
+        click.echo(
+            f"Reassign the {INDEX_ALIAS_NAME!r} alias from old index "
+            f"to {document_index._name}"
+        )
+
     t1 = time.time()
     took = t1 - t0
     rate = count_done / took
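The "bundle the change" comment above is the heart of the zero-downtime claim: adding the alias to the new index and removing the old indexes travel in a single `update_aliases()` call, which Elasticsearch applies atomically. A sketch of the payload that ends up on the `_aliases` endpoint (the index names are illustrative):

```python
import json

# Both actions are applied in one atomic step, so the mdn_docs alias never
# points at nothing and no search query ever hits an empty index.
actions = {
    "actions": [
        {"add": {"index": "mdn_docs_20210331094500", "alias": "mdn_docs"}},
        {"remove_index": {"index": "mdn_docs_20210330093714"}},
    ]
}
print(json.dumps(actions, indent=2))
# connection.indices.update_aliases(actions) is how the patch sends it.
```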
@@ -139,15 +195,6 @@ def get_progressbar():
     for error, count in errors_counter.most_common():
         click.echo(f"{count:,}\t{error[:80]}")

-    if priority_prefixes:
-        click.echo("Counts per priority prefixes:")
-        rest = sum(v for v in count_by_prefix.values())
-        for prefix in priority_prefixes:
-            click.echo(f"\t{prefix:<30} {count_by_prefix[prefix]:,}")
-            rest -= count_by_prefix[prefix]
-        prefix = "*rest*"
-        click.echo(f"\t{prefix:<30} {rest:,}")
-

 class VoidProgressBar:
     def __enter__(self):
@@ -184,7 +231,7 @@ def walk(root):
             yield path


-def to_search(file):
+def to_search(file, _index=None):
     with open(file) as f:
         data = json.load(f)
     if "doc" not in data:
@@ -207,7 +254,8 @@ def to_search(file):
         # files.
         return
     locale = locale[1:]
-    return Document(
+    d = Document(
+        _index=_index,
         _id=doc["mdn_url"],
         title=doc["title"],
         # This is confusing. But it's worth hacking around for now until
@@ -268,6 +316,7 @@ def to_search(file):
         slug=slug.lower(),
         locale=locale.lower(),
     )
+    return d


 _display_none_regex = re.compile(r"display:\s*none")
diff --git a/deployer/src/deployer/search/models.py b/deployer/src/deployer/search/models.py
index 980f76d04f7a..fc5f5b51d447 100644
--- a/deployer/src/deployer/search/models.py
+++ b/deployer/src/deployer/search/models.py
@@ -1,3 +1,5 @@
+import datetime
+
 from elasticsearch_dsl import (
     Boolean,
     Document as ESDocument,
@@ -9,6 +11,12 @@
     char_filter,
 )

+# Note, this is the name that the Kuma code will use when sending Elasticsearch
+# search queries.
+# We always build an index that is called something based on this name but with
+# the _YYYYMMDDHHMMSS date suffix.
+INDEX_ALIAS_NAME = "mdn_docs"
+
 """
 A great way to debug analyzers is with the `Document._index.analyze()` API
 which is a convenient abstraction for the Elasticsearch Analyze API
@@ -172,4 +180,6 @@ class Document(ESDocument):
     popularity = Float()

     class Index:
-        name = "mdn_docs"
+        name = (
+            f'{INDEX_ALIAS_NAME}_{datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")}'
+        )
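On the query side nothing changes: Kuma keeps searching the stable `INDEX_ALIAS_NAME` alias, whatever date-suffixed index happens to back it. A sketch of such a query with `elasticsearch_dsl` (the cluster URL and search term are illustrative; Kuma has its own connection configuration):

```python
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections

# Assumed local cluster URL for the sketch.
connections.create_connection(hosts=["http://localhost:9200"])

# Queries go to the alias, so swapping the underlying index between
# deploys requires no change here.
search = Search(index="mdn_docs").query("match", title="aliases")
for hit in search.execute():
    print(hit.meta.id, hit.title)
```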