From 1a4878ead985692c95d733990111a8eb142707b2 Mon Sep 17 00:00:00 2001
From: Peter Allen Webb
Date: Thu, 9 Nov 2023 13:05:24 -0500
Subject: [PATCH 1/3] Performance enhancement. Make extract_toplevel_blocks() faster, especially for large files.

---
 core/dbt/clients/_jinja_blocks.py | 46 ++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/core/dbt/clients/_jinja_blocks.py b/core/dbt/clients/_jinja_blocks.py
index 1ada0a6234d..d0f560f5408 100644
--- a/core/dbt/clients/_jinja_blocks.py
+++ b/core/dbt/clients/_jinja_blocks.py
@@ -1,5 +1,7 @@
+import dataclasses
 import re
 from collections import namedtuple
+from typing import Dict, Optional
 
 from dbt.exceptions import (
     BlockDefinitionNotAtTopError,
@@ -97,13 +99,26 @@ def end_pat(self):
 QUOTE_START_PATTERN = regex(r"""(?P<quote>(['"]))""")
 
 
+@dataclasses.dataclass
+class PositionedMatch:
+    """Used to accelerate TagIterator. Records the result of searching a string, starting
+    at start_pos and finding match (or None)."""
+
+    start_pos: int
+    match: Optional[re.Match]
+
+
 class TagIterator:
     def __init__(self, data):
         self.data = data
-        self.blocks = []
-        self._parenthesis_stack = []
+        self.blocks = []  # unused?
+        self._parenthesis_stack = []  # unused?
         self.pos = 0
 
+        # Performance enhancement: A cache of the most recent matches seen for each pattern.
+        # Includes the start position used for the search.
+        self._past_matches: Dict[re.Pattern, PositionedMatch] = {}
+
     def linepos(self, end=None) -> str:
         """Given an absolute position in the input data, return a pair of
         line number + relative position to the start of the line.
@@ -122,8 +137,31 @@ def advance(self, new_position):
     def rewind(self, amount=1):
         self.pos -= amount
 
-    def _search(self, pattern):
-        return pattern.search(self.data, self.pos)
+    def _search(self, pattern) -> Optional[re.Match]:
+
+        # Check to see if we have a cached search on this pattern.
+        positioned_match = self._past_matches.get(pattern)
+
+        if positioned_match is None or positioned_match.start_pos > self.pos:
+            # We did not have a cached search, just do one from scratch.
+            match = pattern.search(self.data, self.pos)
+            self._past_matches[pattern] = PositionedMatch(self.pos, match)
+        else:
+            # We have a cached search and its start position falls before (or at) the
+            # current search position...
+            if positioned_match.match is None:
+                # ...but there is no match in the rest of the 'data'.
+                match = None
+            elif positioned_match.match.start() >= self.pos:
+                # ...and there is a match we can reuse, because we have not yet reached
+                # the location of the match.
+                match = positioned_match.match
+            else:
+                # ...but we have passed the match, and need to re-search from the position.
+                match = pattern.search(self.data, self.pos)
+                self._past_matches[pattern] = PositionedMatch(self.pos, match)
+
+        return match
 
     def _match(self, pattern):
         return pattern.match(self.data, self.pos)

From 8addbc87d258587751e9bbfe9a96bb86e2007a8b Mon Sep 17 00:00:00 2001
From: Peter Allen Webb
Date: Thu, 9 Nov 2023 13:16:13 -0500
Subject: [PATCH 2/3] Add changelog entry.

---
 .changes/unreleased/Under the Hood-20231109-131545.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 .changes/unreleased/Under the Hood-20231109-131545.yaml

diff --git a/.changes/unreleased/Under the Hood-20231109-131545.yaml b/.changes/unreleased/Under the Hood-20231109-131545.yaml
new file mode 100644
index 00000000000..69b9dac538b
--- /dev/null
+++ b/.changes/unreleased/Under the Hood-20231109-131545.yaml
@@ -0,0 +1,6 @@
+kind: Under the Hood
+body: Make extract_toplevel_blocks() Faster
+time: 2023-11-09T13:15:45.338059-05:00
+custom:
+  Author: peterallenwebb qmalcolm
+  Issue: "9037"

From 22d4c8265204d679828e14871db6c9ab1d0c3f14 Mon Sep 17 00:00:00 2001
From: Peter Allen Webb
Date: Thu, 9 Nov 2023 15:28:52 -0500
Subject: [PATCH 3/3] Clarify comments.

---
 core/dbt/clients/_jinja_blocks.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/core/dbt/clients/_jinja_blocks.py b/core/dbt/clients/_jinja_blocks.py
index d0f560f5408..f1cd42d488d 100644
--- a/core/dbt/clients/_jinja_blocks.py
+++ b/core/dbt/clients/_jinja_blocks.py
@@ -143,7 +143,8 @@ def _search(self, pattern) -> Optional[re.Match]:
         positioned_match = self._past_matches.get(pattern)
 
         if positioned_match is None or positioned_match.start_pos > self.pos:
-            # We did not have a cached search, just do one from scratch.
+            # We did not have a cached search, or we did, but it was done at a location
+            # further along in the string. Do a new search and cache it.
             match = pattern.search(self.data, self.pos)
             self._past_matches[pattern] = PositionedMatch(self.pos, match)
         else:
@@ -153,11 +154,12 @@ def _search(self, pattern) -> Optional[re.Match]:
                 # ...but there is no match in the rest of the 'data'.
                 match = None
             elif positioned_match.match.start() >= self.pos:
-                # ...and there is a match we can reuse, because we have not yet reached
-                # the location of the match.
+                # ...and there is a match we can reuse, because we have not yet passed
+                # the start position of the match. It's still the next match.
                 match = positioned_match.match
             else:
-                # ...but we have passed the match, and need to re-search from the position.
+                # ...but we have passed the start of the cached match, and need to do a
+                # new search from our current position and cache it.
                 match = pattern.search(self.data, self.pos)
                 self._past_matches[pattern] = PositionedMatch(self.pos, match)
 